[AMDGPU] Don't cluster stores

Clustering loads has caching benefits, but as far as I know there is no
advantage to clustering stores on any AMDGPU subtargets.

The disadvantage is that it tends to increase register pressure and
restricts scheduling freedom.

Differential Revision: https://reviews.llvm.org/D85530
This commit is contained in:
Jay Foad 2020-06-03 10:01:12 +01:00
parent 98eaacd73d
commit c799f873cb
26 changed files with 1591 additions and 1600 deletions

View File

@ -283,7 +283,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@ -294,7 +293,6 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@ -308,7 +306,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@ -604,7 +601,6 @@ public:
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}

View File

@ -10,362 +10,364 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT: v_mov_b32_e32 v0, 0x100
; GCN-NEXT: v_mov_b32_e32 v16, 0x100
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_add_u32_e32 v1, 4, v0
; GCN-NEXT: v_add_u32_e32 v31, 64, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0
; GCN-NEXT: s_load_dwordx16 s[68:83], s[10:11], 0x40
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x80
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0
; GCN-NEXT: s_movk_i32 s4, 0x50
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80
; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16
; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s13
; GCN-NEXT: v_mov_b32_e32 v5, s14
; GCN-NEXT: v_mov_b32_e32 v6, s15
; GCN-NEXT: v_mov_b32_e32 v8, s16
; GCN-NEXT: v_mov_b32_e32 v10, s17
; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: v_mov_b32_e32 v14, s19
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
; GCN-NEXT: v_mov_b32_e32 v4, s16
; GCN-NEXT: v_mov_b32_e32 v5, s17
; GCN-NEXT: v_mov_b32_e32 v6, s18
; GCN-NEXT: v_mov_b32_e32 v7, s19
; GCN-NEXT: v_mov_b32_e32 v8, s20
; GCN-NEXT: v_mov_b32_e32 v9, s21
; GCN-NEXT: v_mov_b32_e32 v10, s22
; GCN-NEXT: v_mov_b32_e32 v11, s23
; GCN-NEXT: v_mov_b32_e32 v12, s24
; GCN-NEXT: v_mov_b32_e32 v13, s25
; GCN-NEXT: v_mov_b32_e32 v14, s26
; GCN-NEXT: v_mov_b32_e32 v15, s27
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT: v_add_u32_e32 v0, 4, v16
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s52
; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s53
; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s54
; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s4, 0x50
; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s55
; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v35, s4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s56
; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16
; GCN-NEXT: v_mov_b32_e32 v1, s57
; GCN-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16
; GCN-NEXT: v_mov_b32_e32 v1, s58
; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s5, 0x60
; GCN-NEXT: v_add_u32_e32 v2, 8, v0
; GCN-NEXT: v_add_u32_e32 v3, 12, v0
; GCN-NEXT: v_add_u32_e32 v7, 16, v0
; GCN-NEXT: v_add_u32_e32 v9, 20, v0
; GCN-NEXT: v_add_u32_e32 v11, 24, v0
; GCN-NEXT: v_add_u32_e32 v13, 28, v0
; GCN-NEXT: v_add_u32_e32 v15, 32, v0
; GCN-NEXT: v_mov_b32_e32 v16, s20
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v17, 36, v0
; GCN-NEXT: v_mov_b32_e32 v18, s21
; GCN-NEXT: v_mov_b32_e32 v26, s25
; GCN-NEXT: v_add_u32_e32 v33, 0x44, v0
; GCN-NEXT: v_mov_b32_e32 v34, s69
; GCN-NEXT: v_mov_b32_e32 v4, s71
; GCN-NEXT: v_add_u32_e32 v19, 40, v0
; GCN-NEXT: v_mov_b32_e32 v20, s22
; GCN-NEXT: v_add_u32_e32 v21, 44, v0
; GCN-NEXT: v_mov_b32_e32 v22, s23
; GCN-NEXT: v_add_u32_e32 v23, 48, v0
; GCN-NEXT: v_mov_b32_e32 v24, s24
; GCN-NEXT: v_add_u32_e32 v25, 52, v0
; GCN-NEXT: v_add_u32_e32 v27, 56, v0
; GCN-NEXT: v_mov_b32_e32 v28, s26
; GCN-NEXT: v_add_u32_e32 v29, 60, v0
; GCN-NEXT: v_mov_b32_e32 v30, s27
; GCN-NEXT: v_add_u32_e32 v31, 64, v0
; GCN-NEXT: v_mov_b32_e32 v32, s68
; GCN-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s13, 0x70
; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0
; GCN-NEXT: v_mov_b32_e32 v36, s70
; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0
; GCN-NEXT: v_add_u32_e32 v38, s4, v0
; GCN-NEXT: v_mov_b32_e32 v5, s72
; GCN-NEXT: v_add_u32_e32 v39, 0x54, v0
; GCN-NEXT: v_mov_b32_e32 v6, s73
; GCN-NEXT: v_add_u32_e32 v40, 0x58, v0
; GCN-NEXT: v_mov_b32_e32 v8, s74
; GCN-NEXT: v_add_u32_e32 v41, 0x5c, v0
; GCN-NEXT: v_mov_b32_e32 v10, s75
; GCN-NEXT: v_add_u32_e32 v42, s5, v0
; GCN-NEXT: v_mov_b32_e32 v12, s76
; GCN-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v36, v35, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v38, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v40, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v41, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v42, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0
; GCN-NEXT: v_mov_b32_e32 v14, s77
; GCN-NEXT: v_mov_b32_e32 v4, s81
; GCN-NEXT: s_movk_i32 s14, 0x90
; GCN-NEXT: s_movk_i32 s15, 0xa0
; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0
; GCN-NEXT: v_mov_b32_e32 v16, s78
; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0
; GCN-NEXT: v_mov_b32_e32 v18, s79
; GCN-NEXT: v_add_u32_e32 v32, s13, v0
; GCN-NEXT: v_mov_b32_e32 v20, s80
; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0
; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0
; GCN-NEXT: v_mov_b32_e32 v5, s82
; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0
; GCN-NEXT: v_mov_b32_e32 v6, s83
; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0
; GCN-NEXT: v_mov_b32_e32 v8, s52
; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v43, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0
; GCN-NEXT: v_mov_b32_e32 v4, s53
; GCN-NEXT: s_movk_i32 s16, 0xb0
; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0
; GCN-NEXT: v_mov_b32_e32 v5, s54
; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0
; GCN-NEXT: v_mov_b32_e32 v6, s55
; GCN-NEXT: v_add_u32_e32 v48, s14, v0
; GCN-NEXT: v_mov_b32_e32 v8, s56
; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0
; GCN-NEXT: v_mov_b32_e32 v10, s57
; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0
; GCN-NEXT: v_mov_b32_e32 v12, s58
; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0
; GCN-NEXT: v_mov_b32_e32 v14, s59
; GCN-NEXT: v_add_u32_e32 v52, s15, v0
; GCN-NEXT: v_mov_b32_e32 v16, s60
; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v47, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0
; GCN-NEXT: v_mov_b32_e32 v4, s61
; GCN-NEXT: s_movk_i32 s17, 0xd0
; GCN-NEXT: s_movk_i32 s18, 0xe0
; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0
; GCN-NEXT: v_mov_b32_e32 v5, s62
; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0
; GCN-NEXT: v_mov_b32_e32 v6, s63
; GCN-NEXT: v_add_u32_e32 v56, s16, v0
; GCN-NEXT: v_mov_b32_e32 v8, s64
; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s65
; GCN-NEXT: v_add_u32_e32 v58, 0xb8, v0
; GCN-NEXT: v_mov_b32_e32 v12, s66
; GCN-NEXT: v_add_u32_e32 v59, 0xbc, v0
; GCN-NEXT: v_mov_b32_e32 v14, s67
; GCN-NEXT: v_add_u32_e32 v60, 0xc0, v0
; GCN-NEXT: v_mov_b32_e32 v16, s36
; GCN-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v54, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v55, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v56, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v59, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v60, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0
; GCN-NEXT: v_mov_b32_e32 v4, s37
; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s59
; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v39, s5, v16
; GCN-NEXT: v_mov_b32_e32 v1, s60
; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16
; GCN-NEXT: v_mov_b32_e32 v1, s61
; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16
; GCN-NEXT: v_mov_b32_e32 v1, s62
; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s10, 0x70
; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s63
; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v43, s10, v16
; GCN-NEXT: v_mov_b32_e32 v1, s64
; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16
; GCN-NEXT: v_mov_b32_e32 v1, s65
; GCN-NEXT: buffer_store_dword v1, v44, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v45, 0x78, v16
; GCN-NEXT: v_mov_b32_e32 v1, s66
; GCN-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s67
; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16
; GCN-NEXT: v_mov_b32_e32 v1, s36
; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16
; GCN-NEXT: v_mov_b32_e32 v1, s38
; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s11, 0x90
; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s39
; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v51, s11, v16
; GCN-NEXT: v_mov_b32_e32 v1, s40
; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16
; GCN-NEXT: v_mov_b32_e32 v1, s41
; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16
; GCN-NEXT: v_mov_b32_e32 v1, s42
; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s28, 0xa0
; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s43
; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v55, s28, v16
; GCN-NEXT: v_mov_b32_e32 v1, s44
; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s45
; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s46
; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s29, 0xb0
; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16
; GCN-NEXT: v_mov_b32_e32 v1, s47
; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v59, s29, v16
; GCN-NEXT: v_mov_b32_e32 v1, s48
; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s49
; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s50
; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s51
; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16
; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16
; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16
; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s12, 0xd0
; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s15
; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v67, s12, v16
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s18
; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s13, 0xe0
; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s19
; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v71, s13, v16
; GCN-NEXT: v_mov_b32_e32 v1, s20
; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s22
; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s14, 0xf0
; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16
; GCN-NEXT: v_mov_b32_e32 v1, s23
; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v75, s14, v16
; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: s_and_b32 s7, s7, 63
; GCN-NEXT: s_movk_i32 s19, 0xf0
; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0
; GCN-NEXT: v_mov_b32_e32 v5, s38
; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0
; GCN-NEXT: v_mov_b32_e32 v6, s39
; GCN-NEXT: v_add_u32_e32 v64, s17, v0
; GCN-NEXT: v_mov_b32_e32 v8, s40
; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s41
; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0
; GCN-NEXT: v_mov_b32_e32 v12, s42
; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0
; GCN-NEXT: v_mov_b32_e32 v14, s43
; GCN-NEXT: v_add_u32_e32 v68, s18, v0
; GCN-NEXT: v_mov_b32_e32 v16, s44
; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v63, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v64, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v65, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v66, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v67, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v68, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v69, 0xe4, v0
; GCN-NEXT: v_mov_b32_e32 v4, s45
; GCN-NEXT: v_add_u32_e32 v70, 0xe8, v0
; GCN-NEXT: v_mov_b32_e32 v5, s46
; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0
; GCN-NEXT: v_mov_b32_e32 v6, s47
; GCN-NEXT: v_add_u32_e32 v72, s19, v0
; GCN-NEXT: v_mov_b32_e32 v8, s48
; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s49
; GCN-NEXT: v_add_u32_e32 v74, 0xf8, v0
; GCN-NEXT: v_mov_b32_e32 v12, s50
; GCN-NEXT: buffer_store_dword v4, v69, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v70, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v71, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v72, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v73, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v74, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s26
; GCN-NEXT: v_add_u32_e32 v17, 8, v16
; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s27
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0
; GCN-NEXT: v_mov_b32_e32 v14, s51
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256
; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v4, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v5, v9, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v6, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v8, v15, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v9, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v10, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v11, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v12, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v13, v25, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v14, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v15, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v18, 12, v16
; GCN-NEXT: v_add_u32_e32 v19, 16, v16
; GCN-NEXT: v_add_u32_e32 v20, 20, v16
; GCN-NEXT: v_add_u32_e32 v21, 24, v16
; GCN-NEXT: v_add_u32_e32 v22, 28, v16
; GCN-NEXT: v_add_u32_e32 v23, 32, v16
; GCN-NEXT: v_add_u32_e32 v24, 36, v16
; GCN-NEXT: v_add_u32_e32 v25, 40, v16
; GCN-NEXT: v_add_u32_e32 v26, 44, v16
; GCN-NEXT: v_add_u32_e32 v27, 48, v16
; GCN-NEXT: v_add_u32_e32 v28, 52, v16
; GCN-NEXT: v_add_u32_e32 v29, 56, v16
; GCN-NEXT: v_add_u32_e32 v30, 60, v16
; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_add_u32_e32 v1, s7, v16
; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v5, v20, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v6, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v7, v22, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v8, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v9, v24, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v10, v25, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v11, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v12, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v16, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v17, v33, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v18, v35, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v19, v37, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v20, v38, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v21, v39, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v22, v40, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v23, v41, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v24, v42, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v25, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v26, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v27, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v28, v32, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v29, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v30, v36, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v31, v43, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v32, v44, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v33, v45, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v34, v46, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v35, v47, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v36, v48, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v37, v49, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v38, v50, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v39, v51, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v40, v52, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v41, v53, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v42, v54, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v43, v55, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v44, v56, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v45, v57, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v46, v58, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v47, v59, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v48, v60, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v49, v61, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v50, v62, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v51, v63, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v52, v64, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v53, v65, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v54, v66, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v55, v67, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v56, v68, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v57, v69, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v58, v70, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v59, v71, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v60, v72, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v61, v73, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v62, v74, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v63, v75, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v17, v32, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v18, v33, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v19, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v20, v35, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v21, v36, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v22, v37, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v23, v38, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v24, v39, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v25, v40, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v26, v41, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v27, v42, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v28, v43, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v29, v44, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v30, v45, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v34, v49, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v35, v50, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v36, v51, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v37, v52, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v38, v53, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v39, v54, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v40, v55, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v41, v56, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v42, v57, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v49, v64, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v50, v65, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v51, v66, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v52, v67, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v53, v68, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v54, v69, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v55, v70, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v56, v71, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v57, v72, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT: s_add_u32 s6, s8, 16
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v67, s7
; GCN-NEXT: v_mov_b32_e32 v66, s6
; GCN-NEXT: s_add_u32 s6, s8, 32
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v65, s9
; GCN-NEXT: s_add_u32 s10, s8, 48
; GCN-NEXT: s_add_u32 s6, s8, 16
; GCN-NEXT: v_mov_b32_e32 v64, s8
; GCN-NEXT: s_addc_u32 s11, s9, 0
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, 32
; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, 48
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, 64
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_add_u32 s10, s8, s4
; GCN-NEXT: s_addc_u32 s11, s9, 0
; GCN-NEXT: s_add_u32 s4, s8, s5
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, s13
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off
; GCN-NEXT: s_add_u32 s6, s8, s4
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_add_u32 s4, s8, s5
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s10
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, 0x80
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, s14
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s15
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, s16
; GCN-NEXT: s_add_u32 s4, s8, s11
; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s28
; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s29
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, 0xc0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off
; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s17
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_add_u32 s4, s8, s18
; GCN-NEXT: s_add_u32 s4, s8, s12
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s19
; GCN-NEXT: s_add_u32 s4, s8, s13
; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s14
; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off
; GCN-NEXT: s_endpgm
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx

View File

@ -1954,7 +1954,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: s_lshr_b32 s7, s5, 1
; GFX9-NEXT: s_cmp_eq_u32 s7, 1
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cselect_b32 s0, s9, s8
; GFX9-NEXT: s_cmp_eq_u32 s7, 2
@ -1997,16 +1997,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v11, s1
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_mov_b32_e32 v10, s0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_s:
@ -2015,7 +2015,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: s_lshr_b32 s7, s5, 1
; GFX8-NEXT: s_cmp_eq_u32 s7, 1
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cselect_b32 s0, s9, s8
; GFX8-NEXT: s_cmp_eq_u32 s7, 2
@ -2058,16 +2058,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mov_b32_e32 v7, s7
; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v16i16_s_s:
@ -2108,24 +2108,25 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX7-NEXT: s_cmp_eq_u32 s7, 4
; GFX7-NEXT: s_cselect_b32 s4, s16, s12
; GFX7-NEXT: s_cmp_eq_u32 s7, 5
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_cselect_b32 s5, s16, s13
; GFX7-NEXT: s_cmp_eq_u32 s7, 6
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_cselect_b32 s6, s16, s14
; GFX7-NEXT: s_cmp_eq_u32 s7, 7
; GFX7-NEXT: s_cselect_b32 s7, s16, s15
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: v_mov_b32_e32 v7, s7
; GFX7-NEXT: s_cselect_b32 s7, s16, s15
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GFX7-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -2329,23 +2330,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
; GFX9-NEXT: v_mov_b32_e32 v6, s14
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
; GFX9-NEXT: v_mov_b32_e32 v7, s15
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v11, s1
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, s0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_s:
@ -2390,23 +2391,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
; GFX8-NEXT: v_mov_b32_e32 v6, s14
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
; GFX8-NEXT: v_mov_b32_e32 v7, s15
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v16i16_v_s:
@ -2509,8 +2510,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: v_mov_b32_e32 v4, s20
; GFX9-NEXT: v_mov_b32_e32 v5, s21
@ -2518,8 +2519,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
@ -2528,11 +2527,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v11, s1
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, s0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_v:
@ -2572,8 +2573,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v3, s19
; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s21
@ -2581,8 +2582,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_mov_b32_e32 v7, s23
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
@ -2591,11 +2590,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v16i16_s_v:
@ -2699,8 +2700,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: v_mov_b32_e32 v5, s17
@ -2708,8 +2709,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_mov_b32_e32 v7, s19
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
@ -2718,11 +2717,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v11, s1
; GFX9-NEXT: s_add_u32 s0, 0, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, s0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_addc_u32 s1, 0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_v:
@ -2761,8 +2762,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
; GFX8-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NEXT: v_mov_b32_e32 v5, s17
@ -2770,8 +2771,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_mov_b32_e32 v7, s19
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
@ -2780,11 +2779,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: s_add_u32 s0, 0, 16
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX8-NEXT: s_addc_u32 s1, 0, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v16i16_v_v:

View File

@ -8,39 +8,39 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0
; GCN-NEXT: s_movk_i32 s4, 0x80
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v0, v64
; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v6
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2
; GCN-NEXT: s_movk_i32 s4, 0xc0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v6, v4
; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v7, v5, vcc
; GCN-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
; GCN-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:32
; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v0, v64
; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v4
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GCN-NEXT: s_movk_i32 s4, 0xc0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1]
; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16
; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32
; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48
; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64
; GCN-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:48
; GCN-NEXT: global_load_dwordx4 v[20:23], v[16:17], off offset:16
; GCN-NEXT: global_load_dwordx4 v[24:27], v[16:17], off offset:32
; GCN-NEXT: global_load_dwordx4 v[28:31], v[16:17], off offset:48
; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16
; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32
; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48
; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
@ -55,8 +55,8 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16
; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112

View File

@ -177,35 +177,35 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: ds_write_b8 v0, v5 offset:1
; GFX7-NEXT: ds_write_b8 v0, v6 offset:2
; GFX7-NEXT: ds_write_b8 v0, v7 offset:3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
; GFX7-NEXT: ds_write_b8 v0, v8 offset:5
; GFX7-NEXT: ds_write_b8 v0, v9 offset:6
; GFX7-NEXT: ds_write_b8 v0, v10 offset:7
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
; GFX7-NEXT: ds_write_b8 v0, v5 offset:6
; GFX7-NEXT: ds_write_b8 v0, v6 offset:7
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4
; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
; GFX7-NEXT: ds_write_b8 v0, v2 offset:10
; GFX7-NEXT: ds_write_b8 v0, v5 offset:11
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v4
; GFX7-NEXT: ds_write_b8 v0, v4 offset:12
; GFX7-NEXT: ds_write_b8 v0, v6 offset:13
; GFX7-NEXT: ds_write_b8 v0, v7 offset:14
; GFX7-NEXT: ds_write_b8 v0, v8 offset:15
; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
; GFX7-NEXT: ds_write_b8 v0, v2 offset:14
; GFX7-NEXT: ds_write_b8 v0, v3 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
@ -227,17 +227,17 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: ds_write_b8 v0, v4 offset:1
; GFX7-NEXT: ds_write_b8 v0, v5 offset:2
; GFX7-NEXT: ds_write_b8 v0, v6 offset:3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
; GFX7-NEXT: ds_write_b8 v0, v7 offset:5
; GFX7-NEXT: ds_write_b8 v0, v8 offset:6
; GFX7-NEXT: ds_write_b8 v0, v9 offset:7
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
; GFX7-NEXT: ds_write_b8 v0, v4 offset:6
; GFX7-NEXT: ds_write_b8 v0, v5 offset:7
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3

View File

@ -43,50 +43,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s5, s0, 8
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s5, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v7, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v8, s5
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
; GFX9-NEXT: s_lshr_b32 s4, s2, 24
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_lshr_b32 s0, s3, 8
; GFX9-NEXT: s_lshr_b32 s1, s3, 16
; GFX9-NEXT: s_lshr_b32 s2, s3, 24
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_mov_b32_e32 v8, s2
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
; GFX9-NEXT: ds_write_b8 v1, v5 offset:12
; GFX9-NEXT: ds_write_b8 v1, v6 offset:13
; GFX9-NEXT: ds_write_b8 v1, v7 offset:14
; GFX9-NEXT: ds_write_b8 v1, v8 offset:15
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: s_lshr_b32 s4, s2, 24
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_lshr_b32 s0, s3, 8
; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s1, s3, 16
; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s2, s3, 24
; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: ds_write_b8 v1, v0 offset:15
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
@ -96,50 +96,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s5, s0, 8
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: s_lshr_b32 s6, s0, 16
; GFX7-NEXT: s_lshr_b32 s7, s0, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s4, s1, 16
; GFX7-NEXT: s_lshr_b32 s5, s1, 24
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v7, s4
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v8, s5
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
; GFX7-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: s_lshr_b32 s0, s3, 8
; GFX7-NEXT: s_lshr_b32 s1, s3, 16
; GFX7-NEXT: s_lshr_b32 s2, s3, 24
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v7, s1
; GFX7-NEXT: v_mov_b32_e32 v8, s2
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
; GFX7-NEXT: ds_write_b8 v1, v5 offset:12
; GFX7-NEXT: ds_write_b8 v1, v6 offset:13
; GFX7-NEXT: ds_write_b8 v1, v7 offset:14
; GFX7-NEXT: ds_write_b8 v1, v8 offset:15
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: s_lshr_b32 s0, s3, 8
; GFX7-NEXT: ds_write_b8 v1, v0 offset:12
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s1, s3, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:13
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s2, s3, 24
; GFX7-NEXT: ds_write_b8 v1, v0 offset:14
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:15
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
@ -152,26 +152,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: s_lshr_b32 s0, s3, 16
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: v_mov_b32_e32 v8, s0
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
; GFX9-NEXT: ds_write_b16 v1, v7 offset:12
; GFX9-NEXT: ds_write_b16 v1, v8 offset:14
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_lshr_b32 s0, s3, 16
; GFX9-NEXT: ds_write_b16 v1, v0 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b16 v1, v0 offset:14
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
@ -181,26 +181,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s2
; GFX7-NEXT: v_mov_b32_e32 v7, s3
; GFX7-NEXT: v_mov_b32_e32 v8, s0
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
; GFX7-NEXT: ds_write_b16 v1, v0
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
; GFX7-NEXT: ds_write_b16 v1, v7 offset:12
; GFX7-NEXT: ds_write_b16 v1, v8 offset:14
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
; GFX7-NEXT: ds_write_b16 v1, v0 offset:12
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b16 v1, v0 offset:14
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
ret void

View File

@ -41,39 +41,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshr_b32 s4, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: s_lshr_b32 s3, s2, 24
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s3, s2, 24
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align1:
@ -83,39 +83,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s3, s0, 8
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
; GFX7-NEXT: s_lshr_b32 s6, s0, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s3, s1, 16
; GFX7-NEXT: s_lshr_b32 s4, s1, 24
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v7, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v8, s4
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s3
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
@ -128,21 +128,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align2:
@ -152,21 +152,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s3, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s2
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: s_lshr_b32 s3, s0, 16
; GFX7-NEXT: ds_write_b16 v1, v0
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
ret void

View File

@ -3316,13 +3316,14 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GCN-NEXT: v_and_b32_e32 v2, s3, v3
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc
; GCN-NEXT: v_and_b32_e32 v3, s3, v4
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_endpgm
%r = udiv <3 x i15> %x, %y
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
@ -3460,9 +3461,10 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_endpgm
%r = urem <3 x i15> %x, %y
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
@ -3612,9 +3614,10 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_endpgm
%r = sdiv <3 x i15> %x, %y
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
@ -3780,13 +3783,14 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3
; GCN-NEXT: v_and_b32_e32 v3, s3, v3
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_endpgm
%r = srem <3 x i15> %x, %y
store <3 x i15> %r, <3 x i15> addrspace(1)* %out

View File

@ -744,13 +744,13 @@ entry:
; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN: s_getpc_b64
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@ -777,12 +777,12 @@ entry:
; GCN-LABEL: {{^}}stack_12xv3i32:
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 11
; GCN: s_getpc
@ -806,12 +806,12 @@ entry:
; GCN-LABEL: {{^}}stack_12xv3f32:
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 0x41300000
; GCN: s_getpc
@ -836,20 +836,20 @@ entry:
; GCN-LABEL: {{^}}stack_8xv5i32:
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN: v_mov_b32_e32 v31, 7
@ -870,20 +870,20 @@ entry:
; GCN-LABEL: {{^}}stack_8xv5f32:
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN: v_mov_b32_e32 v31, 0x40e00000

View File

@ -31,9 +31,7 @@ bb:
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
%ld3 = load i32, i32* %la3
; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
; DBG-NOT: Cluster ld/st
; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
@ -78,13 +76,11 @@ bb:
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
%ld3 = load i32, i32* %la3
; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
; DBG-NOT: Cluster ld/st
; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
%sa0 = getelementptr inbounds i32, i32* %sb, i32 0
store i32 %ld0, i32* %sa0
@ -125,7 +121,6 @@ entry:
; CHECK-LABEL: {{^}}no_cluster_image_load:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG-NOT: {{^}}Cluster ld/st
define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) {
entry:

View File

@ -156,28 +156,28 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_endpgm
;
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:

View File

@ -73,9 +73,9 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, 2
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
@ -140,14 +140,14 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]

View File

@ -1084,23 +1084,23 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
; GFX9-NEXT: v_add_u32_e32 v1, 8, v1
; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX9-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -312,7 +312,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; GCN: flat_store_dwordx4
@ -326,6 +325,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa

View File

@ -773,12 +773,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v6, s6
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
@ -910,9 +911,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i16:

View File

@ -45,7 +45,7 @@ entry:
; GCN: s_barrier
; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7

View File

@ -70,16 +70,16 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v9, s9
; GCN-NEXT: v_mov_b32_e32 v10, s10
; GCN-NEXT: v_mov_b32_e32 v11, s11
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48
; GCN-NEXT: s_endpgm
bb:

View File

@ -529,8 +529,8 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
store i32 9, i32 addrspace(1)* %out, align 4

View File

@ -28,14 +28,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.2: ; %bb.1
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_lshl_b32 s7, s10, 2
; GCN-NEXT: s_mov_b32 s32, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_add_i32 s6, s6, s7
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: s_add_i32 s6, s6, s7
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@ -98,14 +98,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_mov_b32 s32, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_add_i32 s6, s6, s7
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: s_add_i32 s6, s6, s7
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@ -166,9 +166,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v6, 1
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5
@ -228,9 +228,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, 1
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4

View File

@ -249,13 +249,13 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
; CI: v_mov_b32
; CI: v_mov_b32
; CI: v_add_i32
; CI: v_add_i32
; CI-DAG: v_add_i32
; CI-DAG: v_add_i32
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28

View File

@ -55,42 +55,42 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_lshr_b32 s4, s2, 8
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: s_lshr_b32 s2, s3, 8
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
; GFX9-NEXT: ds_write_b8 v0, v2 offset:8
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v8, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: ds_write_b8 v0, v6
; GFX9-NEXT: ds_write_b8_d16_hi v0, v6 offset:2
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: s_lshr_b32 s0, s3, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
; GFX9-NEXT: ds_write_b8 v0, v5 offset:4
; GFX9-NEXT: ds_write_b8 v0, v2 offset:13
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
; GFX9-NEXT: ds_write_b8 v0, v2 offset:15
; GFX9-NEXT: ds_write_b8 v0, v3 offset:9
; GFX9-NEXT: ds_write_b8 v0, v4 offset:11
; GFX9-NEXT: ds_write_b8 v0, v6 offset:5
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b8 v0, v1 offset:4
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b8_d16_hi v0, v5 offset:6
; GFX9-NEXT: s_lshr_b32 s4, s3, 8
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s3, s3, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:13
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
; GFX9-NEXT: ds_write_b8 v0, v1 offset:15
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 8
; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
; GFX9-NEXT: ds_write_b8 v0, v7 offset:1
; GFX9-NEXT: ds_write_b8 v0, v8 offset:3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
@ -100,50 +100,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_lshr_b32 s4, s3, 8
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: s_lshr_b32 s4, s3, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s3, s3, 24
; GFX7-NEXT: ds_write_b8 v0, v5 offset:13
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
; GFX7-NEXT: v_mov_b32_e32 v6, s4
; GFX7-NEXT: ds_write_b8 v0, v5 offset:15
; GFX7-NEXT: ds_write_b8 v0, v6 offset:14
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_lshr_b32 s3, s2, 16
; GFX7-NEXT: s_lshr_b32 s2, s2, 24
; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
; GFX7-NEXT: ds_write_b8 v0, v5 offset:9
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: ds_write_b8 v0, v1 offset:12
; GFX7-NEXT: ds_write_b8 v0, v2 offset:11
; GFX7-NEXT: ds_write_b8 v0, v6 offset:10
; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b8 v0, v1 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s4, s3, 8
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s4, s3, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s3, s3, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:15
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
; GFX7-NEXT: ds_write_b8 v0, v1 offset:14
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s2, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: s_lshr_b32 s1, s1, 24
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s2, s1, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s1, s0, 8
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
; GFX7-NEXT: ds_write_b8 v0, v2 offset:6
; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_lshr_b32 s0, s0, 24
; GFX7-NEXT: ds_write_b8 v0, v4
; GFX7-NEXT: s_lshr_b32 s1, s0, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b8 v0, v3 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
; GFX7-NEXT: ds_write_b8 v0, v2 offset:2
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align1:
@ -153,50 +153,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_lshr_b32 s4, s3, 8
; GFX6-NEXT: v_mov_b32_e32 v5, s4
; GFX6-NEXT: s_lshr_b32 s4, s3, 16
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: ds_write_b8 v0, v5 offset:13
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
; GFX6-NEXT: v_mov_b32_e32 v6, s4
; GFX6-NEXT: ds_write_b8 v0, v5 offset:15
; GFX6-NEXT: ds_write_b8 v0, v6 offset:14
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
; GFX6-NEXT: ds_write_b8 v0, v5 offset:9
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
; GFX6-NEXT: v_mov_b32_e32 v6, s3
; GFX6-NEXT: ds_write_b8 v0, v1 offset:12
; GFX6-NEXT: ds_write_b8 v0, v2 offset:11
; GFX6-NEXT: ds_write_b8 v0, v6 offset:10
; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b8 v0, v1 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_lshr_b32 s4, s3, 8
; GFX6-NEXT: ds_write_b8 v0, v1
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_lshr_b32 s4, s3, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:13
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:15
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
; GFX6-NEXT: ds_write_b8 v0, v1 offset:14
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s3, s2, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s2, s1, 16
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s2, s1, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s1, s0, 8
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
; GFX6-NEXT: ds_write_b8 v0, v2 offset:6
; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: ds_write_b8 v0, v4
; GFX6-NEXT: s_lshr_b32 s1, s0, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b8 v0, v3 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
; GFX6-NEXT: ds_write_b8 v0, v2 offset:2
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
; GFX6-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
@ -210,17 +210,17 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
; GFX9-NEXT: ds_write_b16 v0, v4
; GFX9-NEXT: ds_write_b16 v0, v3 offset:4
; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
; GFX9-NEXT: ds_write_b16 v0, v1 offset:12
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
; GFX9-NEXT: ds_write_b16_d16_hi v0, v4 offset:2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
@ -230,26 +230,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v5, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: ds_write_b16 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: ds_write_b16 v0, v3 offset:4
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: ds_write_b16 v0, v2 offset:14
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: ds_write_b16 v0, v1 offset:12
; GFX7-NEXT: ds_write_b16 v0, v3 offset:10
; GFX7-NEXT: ds_write_b16 v0, v4 offset:6
; GFX7-NEXT: ds_write_b16 v0, v5 offset:2
; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b16 v0, v1 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s3, s3, 16
; GFX7-NEXT: ds_write_b16 v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
; GFX7-NEXT: ds_write_b16 v0, v1 offset:14
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align2:
@ -259,26 +259,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: v_mov_b32_e32 v5, s0
; GFX6-NEXT: s_lshr_b32 s0, s1, 16
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: ds_write_b16 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: ds_write_b16 v0, v3 offset:4
; GFX6-NEXT: v_mov_b32_e32 v3, s0
; GFX6-NEXT: s_lshr_b32 s0, s3, 16
; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: ds_write_b16 v0, v2 offset:14
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: ds_write_b16 v0, v1 offset:12
; GFX6-NEXT: ds_write_b16 v0, v3 offset:10
; GFX6-NEXT: ds_write_b16 v0, v4 offset:6
; GFX6-NEXT: ds_write_b16 v0, v5 offset:2
; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b16 v0, v1 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: ds_write_b16 v0, v1
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: ds_write_b16 v0, v1 offset:14
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
; GFX6-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
ret void
@ -307,10 +307,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s3
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align4:

View File

@ -36,10 +36,10 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
; GFX6-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out
@ -53,33 +53,33 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: ds_write_b8 v0, v4
; GFX9-NEXT: ds_write_b8_d16_hi v0, v4 offset:2
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_lshr_b32 s0, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
; GFX9-NEXT: ds_write_b8 v0, v3 offset:4
; GFX9-NEXT: ds_write_b8 v0, v2 offset:9
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 24
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
; GFX9-NEXT: ds_write_b8 v0, v2 offset:11
; GFX9-NEXT: ds_write_b8 v0, v4 offset:5
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: ds_write_b8 v0, v2 offset:4
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 8
; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
; GFX9-NEXT: ds_write_b8 v0, v5 offset:1
; GFX9-NEXT: ds_write_b8 v0, v6 offset:3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align1:
@ -89,39 +89,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
; GFX7-NEXT: v_mov_b32_e32 v4, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s3, s2, 16
; GFX7-NEXT: s_lshr_b32 s2, s2, 24
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
; GFX7-NEXT: ds_write_b8 v0, v4 offset:9
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
; GFX7-NEXT: ds_write_b8 v0, v5 offset:10
; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s2, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_lshr_b32 s1, s1, 24
; GFX7-NEXT: s_lshr_b32 s2, s1, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s1, s0, 8
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
; GFX7-NEXT: ds_write_b8 v0, v4 offset:6
; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_lshr_b32 s0, s0, 24
; GFX7-NEXT: ds_write_b8 v0, v3
; GFX7-NEXT: s_lshr_b32 s1, s0, 24
; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
; GFX7-NEXT: v_mov_b32_e32 v4, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
; GFX7-NEXT: ds_write_b8 v0, v4 offset:2
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align1:
@ -131,39 +131,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
; GFX6-NEXT: v_mov_b32_e32 v4, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
; GFX6-NEXT: ds_write_b8 v0, v4 offset:9
; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
; GFX6-NEXT: ds_write_b8 v0, v1
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s3, s2, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
; GFX6-NEXT: ds_write_b8 v0, v5 offset:10
; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s2, s1, 16
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshr_b32 s2, s1, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s1, s0, 8
; GFX6-NEXT: v_mov_b32_e32 v4, s2
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
; GFX6-NEXT: ds_write_b8 v0, v4 offset:6
; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v3, s0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: ds_write_b8 v0, v3
; GFX6-NEXT: s_lshr_b32 s1, s0, 24
; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
; GFX6-NEXT: v_mov_b32_e32 v4, s1
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
; GFX6-NEXT: ds_write_b8 v0, v4 offset:2
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
; GFX6-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
@ -178,13 +178,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
; GFX9-NEXT: ds_write_b16 v0, v3
; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align2:
@ -194,21 +194,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_write_b16 v0, v3
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: ds_write_b16 v0, v2 offset:10
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
; GFX7-NEXT: ds_write_b16 v0, v3 offset:6
; GFX7-NEXT: ds_write_b16 v0, v4 offset:2
; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
; GFX7-NEXT: ds_write_b16 v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align2:
@ -218,21 +218,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s0
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_lshr_b32 s0, s1, 16
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: ds_write_b16 v0, v3
; GFX6-NEXT: v_mov_b32_e32 v3, s0
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: ds_write_b16 v0, v2 offset:10
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
; GFX6-NEXT: ds_write_b16 v0, v3 offset:6
; GFX6-NEXT: ds_write_b16 v0, v4 offset:2
; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: ds_write_b16 v0, v1
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
; GFX6-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
ret void
@ -260,9 +260,9 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT: ds_write_b32 v0, v3 offset:8
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align4:
@ -302,10 +302,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v2, v3 offset:8
; GFX7-NEXT: ds_write_b64 v2, v[0:1]
; GFX7-NEXT: s_endpgm
;
@ -316,10 +316,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
; GFX6-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
@ -359,10 +359,10 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
; GFX6-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16

View File

@ -6,14 +6,14 @@
define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
; CIVI-LABEL: local_store_i56:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: ds_write_b32 v0, v1
; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
; CIVI-NEXT: ds_write_b32 v0, v1
; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: local_store_i56:
; GFX9: ; %bb.0:
@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_or_b32 s0, s4, 14
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v1, s0
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
; HAWAII-NEXT: s_waitcnt vmcnt(0)
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
; HAWAII-NEXT: ds_write_b32 v1, v2
; HAWAII-NEXT: s_endpgm
; HAWAII-NEXT: s_or_b32 s0, s4, 14
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v1, s0
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
; HAWAII-NEXT: ds_write_b32 v1, v2
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_or_b32 s0, s4, 14
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s5
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_endpgm
; FIJI-NEXT: s_or_b32 s0, s4, 14
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s5
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i55:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v2, s3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: ds_write_b32 v0, v3
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT: ds_write_b32 v0, v3
; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void
}
@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
; HAWAII-LABEL: local_store_i48:
; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
; HAWAII-NEXT: v_mov_b32_e32 v1, s2
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
; HAWAII-NEXT: ds_write_b32 v0, v2
; HAWAII-NEXT: s_endpgm
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s2
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: ds_write_b32 v0, v1
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i48:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v2, s1
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
; FIJI-NEXT: ds_write_b32 v0, v2
; FIJI-NEXT: s_endpgm
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: ds_write_b32 v0, v1
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i48:
; GFX9: ; %bb.0:
@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
; HAWAII-LABEL: local_store_i65:
; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: s_and_b32 s3, s3, 1
; HAWAII-NEXT: v_mov_b32_e32 v3, s3
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
; HAWAII-NEXT: s_endpgm
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: s_and_b32 s3, s3, 1
; HAWAII-NEXT: v_mov_b32_e32 v0, s3
; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i65:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0
; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: s_and_b32 s3, s3, 1
; FIJI-NEXT: v_mov_b32_e32 v3, s3
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: ds_write_b8 v2, v3 offset:8
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
; FIJI-NEXT: s_endpgm
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0
; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: s_and_b32 s3, s3, 1
; FIJI-NEXT: v_mov_b32_e32 v0, s3
; FIJI-NEXT: ds_write_b8 v2, v0 offset:8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i65:
; GFX9: ; %bb.0:
@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
; CIVI-LABEL: local_store_i17:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1
; CIVI-NEXT: ds_write_b16 v0, v1
; CIVI-NEXT: ds_write_b8 v0, v2 offset:2
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: ds_write_b16 v0, v1
; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1
; CIVI-NEXT: ds_write_b8 v0, v1 offset:2
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: local_store_i17:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
store i17 %arg, i17 addrspace(3)* %ptr, align 8
ret void
}

View File

@ -5,37 +5,37 @@
; GCN-LABEL: {{^}}token_factor_inline_limit_test:
; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}}
; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 v31, 7

View File

@ -135,12 +135,13 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s7, s7, 34
; SI-NEXT: s_or_b32 s7, s7, 4
; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_mov_b32_e32 v1, s8
; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i17_constant_load:
@ -157,9 +158,9 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_bfe_u32 s0, s0, 0x10010
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
%load = load i17, i17 addrspace(4)* %arg, align 4
%add = add i17 %load, 34