forked from OSchip/llvm-project
[AMDGPU] Don't cluster stores
Clustering loads has caching benefits, but as far as I know there is no advantage to clustering stores on any AMDGPU subtargets. The disadvantage is that it tends to increase register pressure and restricts scheduling freedom. Differential Revision: https://reviews.llvm.org/D85530
This commit is contained in:
parent
98eaacd73d
commit
c799f873cb
|
@ -283,7 +283,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
|
|||
ScheduleDAGMILive *DAG =
|
||||
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
|
||||
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
|
||||
return DAG;
|
||||
|
@ -294,7 +293,6 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
|
|||
auto DAG = new GCNIterativeScheduler(C,
|
||||
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
return DAG;
|
||||
}
|
||||
|
||||
|
@ -308,7 +306,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
|
|||
auto DAG = new GCNIterativeScheduler(C,
|
||||
GCNIterativeScheduler::SCHEDULE_ILP);
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
|
||||
return DAG;
|
||||
}
|
||||
|
@ -604,7 +601,6 @@ public:
|
|||
createMachineScheduler(MachineSchedContext *C) const override {
|
||||
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
|
||||
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
return DAG;
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -10,362 +10,364 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
|
|||
; GCN-NEXT: s_add_u32 s0, s0, s7
|
||||
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x100
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0x100
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: v_add_u32_e32 v1, 4, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v31, 64, v16
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0
|
||||
; GCN-NEXT: s_load_dwordx16 s[68:83], s[10:11], 0x40
|
||||
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x80
|
||||
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x50
|
||||
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40
|
||||
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80
|
||||
; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s19
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s19
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s21
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s23
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s24
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s25
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s26
|
||||
; GCN-NEXT: v_mov_b32_e32 v15, s27
|
||||
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256
|
||||
; GCN-NEXT: v_add_u32_e32 v0, 4, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s52
|
||||
; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s53
|
||||
; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s54
|
||||
; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x50
|
||||
; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s55
|
||||
; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v35, s4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s56
|
||||
; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s57
|
||||
; GCN-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s58
|
||||
; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s5, 0x60
|
||||
; GCN-NEXT: v_add_u32_e32 v2, 8, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v3, 12, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v7, 16, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v9, 20, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v11, 24, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v13, 28, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v15, 32, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s20
|
||||
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v17, 36, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s21
|
||||
; GCN-NEXT: v_mov_b32_e32 v26, s25
|
||||
; GCN-NEXT: v_add_u32_e32 v33, 0x44, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v34, s69
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s71
|
||||
; GCN-NEXT: v_add_u32_e32 v19, 40, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v20, s22
|
||||
; GCN-NEXT: v_add_u32_e32 v21, 44, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v22, s23
|
||||
; GCN-NEXT: v_add_u32_e32 v23, 48, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v24, s24
|
||||
; GCN-NEXT: v_add_u32_e32 v25, 52, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v27, 56, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v28, s26
|
||||
; GCN-NEXT: v_add_u32_e32 v29, 60, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v30, s27
|
||||
; GCN-NEXT: v_add_u32_e32 v31, 64, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v32, s68
|
||||
; GCN-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s13, 0x70
|
||||
; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v36, s70
|
||||
; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v38, s4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s72
|
||||
; GCN-NEXT: v_add_u32_e32 v39, 0x54, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s73
|
||||
; GCN-NEXT: v_add_u32_e32 v40, 0x58, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s74
|
||||
; GCN-NEXT: v_add_u32_e32 v41, 0x5c, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s75
|
||||
; GCN-NEXT: v_add_u32_e32 v42, s5, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s76
|
||||
; GCN-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v36, v35, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v38, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v40, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v41, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v42, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s77
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s81
|
||||
; GCN-NEXT: s_movk_i32 s14, 0x90
|
||||
; GCN-NEXT: s_movk_i32 s15, 0xa0
|
||||
; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s78
|
||||
; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s79
|
||||
; GCN-NEXT: v_add_u32_e32 v32, s13, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v20, s80
|
||||
; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0
|
||||
; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s82
|
||||
; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s83
|
||||
; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s52
|
||||
; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v43, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s53
|
||||
; GCN-NEXT: s_movk_i32 s16, 0xb0
|
||||
; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s54
|
||||
; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s55
|
||||
; GCN-NEXT: v_add_u32_e32 v48, s14, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s56
|
||||
; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s57
|
||||
; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s58
|
||||
; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s59
|
||||
; GCN-NEXT: v_add_u32_e32 v52, s15, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s60
|
||||
; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v47, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s61
|
||||
; GCN-NEXT: s_movk_i32 s17, 0xd0
|
||||
; GCN-NEXT: s_movk_i32 s18, 0xe0
|
||||
; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s62
|
||||
; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s63
|
||||
; GCN-NEXT: v_add_u32_e32 v56, s16, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s64
|
||||
; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s65
|
||||
; GCN-NEXT: v_add_u32_e32 v58, 0xb8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s66
|
||||
; GCN-NEXT: v_add_u32_e32 v59, 0xbc, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s67
|
||||
; GCN-NEXT: v_add_u32_e32 v60, 0xc0, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s36
|
||||
; GCN-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v54, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v55, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v56, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v14, v59, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v16, v60, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s37
|
||||
; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s59
|
||||
; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v39, s5, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s60
|
||||
; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s61
|
||||
; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s62
|
||||
; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s10, 0x70
|
||||
; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s63
|
||||
; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v43, s10, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s64
|
||||
; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s65
|
||||
; GCN-NEXT: buffer_store_dword v1, v44, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v45, 0x78, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s66
|
||||
; GCN-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s67
|
||||
; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s36
|
||||
; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s37
|
||||
; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s38
|
||||
; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s11, 0x90
|
||||
; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s39
|
||||
; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v51, s11, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s40
|
||||
; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s41
|
||||
; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s42
|
||||
; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s28, 0xa0
|
||||
; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s43
|
||||
; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v55, s28, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s44
|
||||
; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s45
|
||||
; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s46
|
||||
; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s29, 0xb0
|
||||
; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s47
|
||||
; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v59, s29, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s48
|
||||
; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s49
|
||||
; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s50
|
||||
; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s51
|
||||
; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s14
|
||||
; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s12, 0xd0
|
||||
; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s15
|
||||
; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v67, s12, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s17
|
||||
; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s18
|
||||
; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s13, 0xe0
|
||||
; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s19
|
||||
; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v71, s13, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s20
|
||||
; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s21
|
||||
; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s22
|
||||
; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_movk_i32 s14, 0xf0
|
||||
; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s23
|
||||
; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v75, s14, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s24
|
||||
; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s25
|
||||
; GCN-NEXT: s_and_b32 s7, s7, 63
|
||||
; GCN-NEXT: s_movk_i32 s19, 0xf0
|
||||
; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s38
|
||||
; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s39
|
||||
; GCN-NEXT: v_add_u32_e32 v64, s17, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s40
|
||||
; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s41
|
||||
; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s42
|
||||
; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s43
|
||||
; GCN-NEXT: v_add_u32_e32 v68, s18, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s44
|
||||
; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v63, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v64, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v65, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v66, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v14, v67, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v16, v68, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v69, 0xe4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s45
|
||||
; GCN-NEXT: v_add_u32_e32 v70, 0xe8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s46
|
||||
; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s47
|
||||
; GCN-NEXT: v_add_u32_e32 v72, s19, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s48
|
||||
; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s49
|
||||
; GCN-NEXT: v_add_u32_e32 v74, 0xf8, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s50
|
||||
; GCN-NEXT: buffer_store_dword v4, v69, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v70, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v71, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v72, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v73, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v74, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s26
|
||||
; GCN-NEXT: v_add_u32_e32 v17, 8, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s27
|
||||
; GCN-NEXT: s_lshl_b32 s7, s7, 2
|
||||
; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s51
|
||||
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256
|
||||
; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
|
||||
; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v4, v7, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v5, v9, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v6, v11, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v8, v15, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v9, v17, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v10, v19, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v11, v21, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v12, v23, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v13, v25, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v14, v27, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v15, v29, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_add_u32_e32 v18, 12, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v19, 16, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v20, 20, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v21, 24, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v22, 28, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v23, 32, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v24, 36, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v25, 40, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v26, 44, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v27, 48, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v28, 52, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v29, 56, v16
|
||||
; GCN-NEXT: v_add_u32_e32 v30, 60, v16
|
||||
; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: v_add_u32_e32 v1, s7, v16
|
||||
; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v11, v26, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v5, v20, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v6, v21, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v7, v22, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v8, v23, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v9, v24, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v10, v25, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v11, v26, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v12, v27, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v16, v31, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v17, v33, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v18, v35, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v19, v37, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v20, v38, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v21, v39, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v22, v40, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v23, v41, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v24, v42, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v25, v26, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v26, v28, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v27, v30, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v28, v32, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v29, v34, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v30, v36, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v31, v43, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v32, v44, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v33, v45, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v34, v46, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v35, v47, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v36, v48, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v37, v49, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v38, v50, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v39, v51, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v40, v52, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v41, v53, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v42, v54, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v43, v55, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v44, v56, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v45, v57, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v46, v58, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v47, v59, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v48, v60, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v49, v61, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v50, v62, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v51, v63, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v52, v64, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v53, v65, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v54, v66, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v55, v67, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v56, v68, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v57, v69, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v58, v70, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v59, v71, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v60, v72, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v61, v73, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v62, v74, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v63, v75, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v17, v32, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v18, v33, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v19, v34, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v20, v35, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v21, v36, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v22, v37, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v23, v38, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v24, v39, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v25, v40, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v26, v41, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v27, v42, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v28, v43, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v29, v44, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v30, v45, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v34, v49, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v35, v50, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v36, v51, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v37, v52, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v38, v53, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v39, v54, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v40, v55, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v41, v56, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v42, v57, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v49, v64, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v50, v65, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v51, v66, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v52, v67, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v53, v68, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v54, v69, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v55, v70, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v56, v71, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v57, v72, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 16
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v67, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v66, s6
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 32
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v65, s9
|
||||
; GCN-NEXT: s_add_u32 s10, s8, 48
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v64, s8
|
||||
; GCN-NEXT: s_addc_u32 s11, s9, 0
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 32
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 48
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, 64
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: s_add_u32 s10, s8, s4
|
||||
; GCN-NEXT: s_addc_u32 s11, s9, 0
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s5
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off
|
||||
; GCN-NEXT: s_add_u32 s6, s8, s4
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s10
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, 0x80
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, s14
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NEXT: s_add_u32 s6, s8, s16
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off
|
||||
; GCN-NEXT: s_addc_u32 s7, s9, 0
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s28
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s29
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, 0xc0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s17
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s18
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s12
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s19
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s13
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s14
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off
|
||||
; GCN-NEXT: s_addc_u32 s5, s9, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
|
||||
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
|
||||
|
|
|
@ -1954,7 +1954,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: s_lshr_b32 s7, s5, 1
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s7, 1
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_cselect_b32 s0, s9, s8
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s7, 2
|
||||
|
@ -1997,16 +1997,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
|
||||
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
|
||||
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: insertelement_s_v16i16_s_s:
|
||||
|
@ -2015,7 +2015,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: s_lshr_b32 s7, s5, 1
|
||||
; GFX8-NEXT: s_cmp_eq_u32 s7, 1
|
||||
; GFX8-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s9, s8
|
||||
; GFX8-NEXT: s_cmp_eq_u32 s7, 2
|
||||
|
@ -2058,16 +2058,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: insertelement_s_v16i16_s_s:
|
||||
|
@ -2108,24 +2108,25 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX7-NEXT: s_cmp_eq_u32 s7, 4
|
||||
; GFX7-NEXT: s_cselect_b32 s4, s16, s12
|
||||
; GFX7-NEXT: s_cmp_eq_u32 s7, 5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_cselect_b32 s5, s16, s13
|
||||
; GFX7-NEXT: s_cmp_eq_u32 s7, 6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_cselect_b32 s6, s16, s14
|
||||
; GFX7-NEXT: s_cmp_eq_u32 s7, 7
|
||||
; GFX7-NEXT: s_cselect_b32 s7, s16, s15
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_mov_b32 s10, -1
|
||||
; GFX7-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GFX7-NEXT: s_cselect_b32 s7, s16, s15
|
||||
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GFX7-NEXT: s_nop 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
||||
; GFX7-NEXT: s_endpgm
|
||||
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
|
||||
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
|
||||
|
@ -2329,23 +2330,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s14
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
|
||||
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: insertelement_s_v16i16_v_s:
|
||||
|
@ -2390,23 +2391,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s14
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: insertelement_s_v16i16_v_s:
|
||||
|
@ -2509,8 +2510,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s17
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s20
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s21
|
||||
|
@ -2518,8 +2519,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_mov_b32_e32 v7, s23
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
|
||||
|
@ -2528,11 +2527,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
|
||||
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: insertelement_s_v16i16_s_v:
|
||||
|
@ -2572,8 +2573,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s17
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s20
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s21
|
||||
|
@ -2581,8 +2582,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_mov_b32_e32 v7, s23
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
|
||||
|
@ -2591,11 +2590,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: insertelement_s_v16i16_s_v:
|
||||
|
@ -2699,8 +2700,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s17
|
||||
|
@ -2708,8 +2709,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_mov_b32_e32 v7, s19
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
|
||||
|
@ -2718,11 +2717,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX9-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
|
||||
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
|
||||
; GFX9-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: insertelement_s_v16i16_v_v:
|
||||
|
@ -2761,8 +2762,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s17
|
||||
|
@ -2770,8 +2771,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_mov_b32_e32 v7, s19
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
|
||||
|
@ -2780,11 +2779,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
|
|||
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX8-NEXT: s_add_u32 s0, 0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GFX8-NEXT: s_addc_u32 s1, 0, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: insertelement_s_v16i16_v_v:
|
||||
|
|
|
@ -8,39 +8,39 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x80
|
||||
; GCN-NEXT: s_mov_b32 s5, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v0, v64
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
||||
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v6
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2
|
||||
; GCN-NEXT: s_movk_i32 s4, 0xc0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v6, v4
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v7, v5, vcc
|
||||
; GCN-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:32
|
||||
; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v0, v64
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
||||
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v4
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
|
||||
; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
|
||||
; GCN-NEXT: s_movk_i32 s4, 0xc0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
|
||||
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
|
||||
; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1]
|
||||
; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64
|
||||
; GCN-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[20:23], v[16:17], off offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[24:27], v[16:17], off offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[28:31], v[16:17], off offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
|
||||
; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
|
||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
||||
; GCN-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7
|
||||
; GCN-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
|
||||
|
@ -55,8 +55,8 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112
|
||||
|
|
|
@ -177,35 +177,35 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
|
|||
; GFX7-LABEL: store_lds_v4i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v7 offset:3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v8 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v0, v9 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v10 offset:7
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:7
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:11
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:12
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:13
|
||||
; GFX7-NEXT: ds_write_b8 v0, v7 offset:14
|
||||
; GFX7-NEXT: ds_write_b8 v0, v8 offset:15
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:14
|
||||
; GFX7-NEXT: ds_write_b8 v0, v3 offset:15
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
|
||||
|
@ -227,17 +227,17 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v7 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v0, v8 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v9 offset:7
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:7
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3
|
||||
|
|
|
@ -43,50 +43,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s7, s0, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s1, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
|
||||
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s3, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
|
||||
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
|
||||
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
|
||||
; GFX9-NEXT: ds_write_b8 v1, v5 offset:12
|
||||
; GFX9-NEXT: ds_write_b8 v1, v6 offset:13
|
||||
; GFX9-NEXT: ds_write_b8 v1, v7 offset:14
|
||||
; GFX9-NEXT: ds_write_b8 v1, v8 offset:15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s3, 24
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:15
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align1:
|
||||
|
@ -96,50 +96,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s5, s0, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-NEXT: s_lshr_b32 s6, s0, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s7, s0, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s1, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s5, s1, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v7, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
||||
; GFX7-NEXT: v_mov_b32_e32 v8, s5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0
|
||||
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
|
||||
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
|
||||
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
|
||||
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s3, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
|
||||
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
|
||||
; GFX7-NEXT: ds_write_b8 v1, v5 offset:12
|
||||
; GFX7-NEXT: ds_write_b8 v1, v6 offset:13
|
||||
; GFX7-NEXT: ds_write_b8 v1, v7 offset:14
|
||||
; GFX7-NEXT: ds_write_b8 v1, v8 offset:15
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:12
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:13
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s3, 24
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:15
|
||||
; GFX7-NEXT: s_endpgm
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
|
@ -152,26 +152,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
|
||||
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
|
||||
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
|
||||
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
|
||||
; GFX9-NEXT: ds_write_b16 v1, v7 offset:12
|
||||
; GFX9-NEXT: ds_write_b16 v1, v8 offset:14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:14
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align2:
|
||||
|
@ -181,26 +181,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
|
||||
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
|
||||
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
|
||||
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
|
||||
; GFX7-NEXT: ds_write_b16 v1, v7 offset:12
|
||||
; GFX7-NEXT: ds_write_b16 v1, v8 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:12
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:14
|
||||
; GFX7-NEXT: s_endpgm
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
|
||||
ret void
|
||||
|
|
|
@ -41,39 +41,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s1, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
|
||||
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
|
||||
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
|
||||
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align1:
|
||||
|
@ -83,39 +83,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s0, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s6, s0, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s1, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0
|
||||
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
|
||||
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
|
||||
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
|
||||
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
|
||||
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX7-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
|
@ -128,21 +128,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
|
||||
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
|
||||
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
|
||||
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align2:
|
||||
|
@ -152,21 +152,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
|
||||
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
|
||||
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
|
||||
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
|
||||
; GFX7-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
|
||||
ret void
|
||||
|
|
|
@ -3316,13 +3316,14 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GCN-NEXT: v_and_b32_e32 v2, s3, v3
|
||||
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc
|
||||
; GCN-NEXT: v_and_b32_e32 v3, s3, v4
|
||||
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
|
||||
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
|
||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_endpgm
|
||||
%r = udiv <3 x i15> %x, %y
|
||||
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
|
||||
|
@ -3460,9 +3461,10 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_endpgm
|
||||
%r = urem <3 x i15> %x, %y
|
||||
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
|
||||
|
@ -3612,9 +3614,10 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_endpgm
|
||||
%r = sdiv <3 x i15> %x, %y
|
||||
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
|
||||
|
@ -3780,13 +3783,14 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3
|
||||
; GCN-NEXT: v_and_b32_e32 v3, s3, v3
|
||||
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
|
||||
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
|
||||
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
|
||||
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GCN-NEXT: s_endpgm
|
||||
%r = srem <3 x i15> %x, %y
|
||||
store <3 x i15> %r, <3 x i15> addrspace(1)* %out
|
||||
|
|
|
@ -744,13 +744,13 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}tail_call_byval_align16:
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
|
||||
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
|
||||
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
|
||||
|
||||
; GCN: s_getpc_b64
|
||||
|
||||
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_setpc_b64
|
||||
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
|
||||
|
@ -777,12 +777,12 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}stack_12xv3i32:
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 v31, 11
|
||||
; GCN: s_getpc
|
||||
|
@ -806,12 +806,12 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}stack_12xv3f32:
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 v31, 0x41300000
|
||||
; GCN: s_getpc
|
||||
|
@ -836,20 +836,20 @@ entry:
|
|||
; GCN-LABEL: {{^}}stack_8xv5i32:
|
||||
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
|
||||
; GCN: v_mov_b32_e32 v31, 7
|
||||
|
@ -870,20 +870,20 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}stack_8xv5f32:
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
|
||||
; GCN: v_mov_b32_e32 v31, 0x40e00000
|
||||
|
|
|
@ -31,9 +31,7 @@ bb:
|
|||
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
|
||||
%ld3 = load i32, i32* %la3
|
||||
|
||||
; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
|
||||
; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
|
||||
; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
|
||||
; DBG-NOT: Cluster ld/st
|
||||
; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
|
||||
|
@ -78,13 +76,11 @@ bb:
|
|||
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
|
||||
%ld3 = load i32, i32* %la3
|
||||
|
||||
; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
|
||||
; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
|
||||
; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
|
||||
; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
|
||||
; DBG-NOT: Cluster ld/st
|
||||
; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
|
||||
; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
|
||||
; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
|
||||
%sa0 = getelementptr inbounds i32, i32* %sb, i32 0
|
||||
store i32 %ld0, i32* %sa0
|
||||
|
@ -125,7 +121,6 @@ entry:
|
|||
; CHECK-LABEL: {{^}}no_cluster_image_load:
|
||||
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
|
||||
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
|
||||
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
|
||||
; DBG-NOT: {{^}}Cluster ld/st
|
||||
define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) {
|
||||
entry:
|
||||
|
|
|
@ -156,28 +156,28 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
|
|||
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
|
||||
; GFX7-ALIGNED: ; %bb.0:
|
||||
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
|
||||
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
|
||||
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
|
||||
; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
|
||||
; GFX7-ALIGNED-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
|
||||
|
|
|
@ -73,9 +73,9 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
|
|||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 2
|
||||
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
|
||||
; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
|
||||
|
@ -140,14 +140,14 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
|
|||
; GFX7-ALIGNED: ; %bb.0:
|
||||
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
|
||||
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
|
||||
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
|
||||
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1
|
||||
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
|
||||
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
|
||||
|
|
|
@ -1084,23 +1084,23 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
|
|||
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7
|
||||
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
|
||||
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
|
||||
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
|
||||
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, 8, v1
|
||||
; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
|
||||
; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
|
||||
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
|
||||
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
|
||||
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
|
||||
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
|
||||
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
|
||||
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2
|
||||
; GFX9-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
|
|
@ -312,7 +312,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
|
|||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
|
||||
; GCN: flat_store_dwordx4
|
||||
|
||||
|
@ -326,6 +325,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
|
|||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
|
||||
; VI: v_cvt_f32_f16_e32
|
||||
; VI: v_cvt_f32_f16_sdwa
|
||||
|
|
|
@ -773,12 +773,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
|
|||
; VI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s5
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
||||
; VI-NEXT: s_nop 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
|
||||
store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
|
||||
|
@ -910,9 +911,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
|
|||
; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: dynamic_insertelement_v3i16:
|
||||
|
|
|
@ -45,7 +45,7 @@ entry:
|
|||
|
||||
; GCN: s_barrier
|
||||
|
||||
; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
|
||||
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
|
||||
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
|
||||
|
|
|
@ -70,16 +70,16 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
|
|||
; GCN-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
|
|
|
@ -529,8 +529,8 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %
|
|||
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
|
||||
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
|
||||
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
|
||||
; GCN: buffer_store_dword v[[HI]]
|
||||
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
|
||||
store i32 9, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -28,14 +28,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
|||
; GCN-NEXT: s_cbranch_scc1 BB0_3
|
||||
; GCN-NEXT: ; %bb.2: ; %bb.1
|
||||
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: s_lshl_b32 s7, s10, 2
|
||||
; GCN-NEXT: s_mov_b32 s32, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_add_i32 s6, s6, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 1
|
||||
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GCN-NEXT: s_add_i32 s6, s6, s7
|
||||
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
|
@ -98,14 +98,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
|||
; GCN-NEXT: ; %bb.1: ; %bb.0
|
||||
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
|
||||
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: s_lshl_b32 s7, s7, 2
|
||||
; GCN-NEXT: s_mov_b32 s32, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_add_i32 s6, s6, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 1
|
||||
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GCN-NEXT: s_add_i32 s6, s6, s7
|
||||
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
|
@ -166,9 +166,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
|||
; GCN-NEXT: s_add_i32 s6, s32, 0x1000
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 1
|
||||
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 1
|
||||
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6
|
||||
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5
|
||||
|
@ -228,9 +228,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
|||
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 1
|
||||
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
|
||||
; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 1
|
||||
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6
|
||||
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4
|
||||
|
|
|
@ -249,13 +249,13 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
|
|||
; CI: v_mov_b32
|
||||
; CI: v_mov_b32
|
||||
|
||||
; CI: v_add_i32
|
||||
; CI: v_add_i32
|
||||
; CI-DAG: v_add_i32
|
||||
; CI-DAG: v_add_i32
|
||||
|
||||
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
|
||||
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
|
||||
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
|
||||
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
|
||||
; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
|
||||
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
|
||||
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
|
||||
|
|
|
@ -55,42 +55,42 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s2, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s3, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:8
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v6
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v6 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s3, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
|
||||
; GFX9-NEXT: ds_write_b8 v0, v5 offset:4
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:15
|
||||
; GFX9-NEXT: ds_write_b8 v0, v3 offset:9
|
||||
; GFX9-NEXT: ds_write_b8 v0, v4 offset:11
|
||||
; GFX9-NEXT: ds_write_b8 v0, v6 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:4
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v5 offset:6
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s3, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX9-NEXT: ds_write_b8 v0, v7 offset:1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v8 offset:3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align1:
|
||||
|
@ -100,50 +100,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s3, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:13
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:15
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:9
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:12
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:11
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s4, s3, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:15
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b8 v0, v3 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v4i32_align1:
|
||||
|
@ -153,50 +153,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GFX6-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v5 offset:13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GFX6-NEXT: ds_write_b8 v0, v5 offset:15
|
||||
; GFX6-NEXT: ds_write_b8 v0, v6 offset:14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v5 offset:9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:12
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:11
|
||||
; GFX6-NEXT: ds_write_b8 v0, v6 offset:10
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NEXT: s_lshr_b32 s4, s3, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:15
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:6
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v4
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b8 v0, v3 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
|
@ -210,17 +210,17 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
|
||||
; GFX9-NEXT: ds_write_b16 v0, v4
|
||||
; GFX9-NEXT: ds_write_b16 v0, v3 offset:4
|
||||
; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1 offset:12
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
|
||||
; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v4 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align2:
|
||||
|
@ -230,26 +230,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: ds_write_b16 v0, v4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: ds_write_b16 v0, v3 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:12
|
||||
; GFX7-NEXT: ds_write_b16 v0, v3 offset:10
|
||||
; GFX7-NEXT: ds_write_b16 v0, v4 offset:6
|
||||
; GFX7-NEXT: ds_write_b16 v0, v5 offset:2
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:14
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v4i32_align2:
|
||||
|
@ -259,26 +259,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: ds_write_b16 v0, v4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-NEXT: ds_write_b16 v0, v3 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:12
|
||||
; GFX6-NEXT: ds_write_b16 v0, v3 offset:10
|
||||
; GFX6-NEXT: ds_write_b16 v0, v4 offset:6
|
||||
; GFX6-NEXT: ds_write_b16 v0, v5 offset:2
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
|
||||
ret void
|
||||
|
@ -307,10 +307,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
|
||||
; GFX7-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v4i32_align4:
|
||||
|
|
|
@ -36,10 +36,10 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out
|
||||
|
@ -53,33 +53,33 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v4
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v4 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v3 offset:4
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 24
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:11
|
||||
; GFX9-NEXT: ds_write_b8 v0, v4 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX9-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX9-NEXT: ds_write_b8 v0, v5 offset:1
|
||||
; GFX9-NEXT: ds_write_b8 v0, v6 offset:3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align1:
|
||||
|
@ -89,39 +89,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:9
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s1, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v3
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v3i32_align1:
|
||||
|
@ -131,39 +131,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v4 offset:9
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
|
||||
; GFX6-NEXT: ds_write_b8 v0, v5 offset:10
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s1, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
|
||||
; GFX6-NEXT: ds_write_b8 v0, v4 offset:6
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v3
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
|
||||
; GFX6-NEXT: ds_write_b8 v0, v4 offset:2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
|
@ -178,13 +178,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
|
|||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
|
||||
; GFX9-NEXT: ds_write_b16 v0, v3
|
||||
; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
|
||||
; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1
|
||||
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align2:
|
||||
|
@ -194,21 +194,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: ds_write_b16 v0, v3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
|
||||
; GFX7-NEXT: ds_write_b16 v0, v3 offset:6
|
||||
; GFX7-NEXT: ds_write_b16 v0, v4 offset:2
|
||||
; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v3i32_align2:
|
||||
|
@ -218,21 +218,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s1, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: ds_write_b16 v0, v3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
|
||||
; GFX6-NEXT: ds_write_b16 v0, v3 offset:6
|
||||
; GFX6-NEXT: ds_write_b16 v0, v4 offset:2
|
||||
; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
|
||||
ret void
|
||||
|
@ -260,9 +260,9 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
|
||||
; GFX7-NEXT: ds_write_b32 v0, v3 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-LABEL: store_lds_v3i32_align4:
|
||||
|
@ -302,10 +302,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
|
|||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX7-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -316,10 +316,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
|
||||
|
@ -359,10 +359,10 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
|
|||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX6-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX6-NEXT: s_endpgm
|
||||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
|
||||
|
|
|
@ -6,14 +6,14 @@
|
|||
define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
|
||||
; CIVI-LABEL: local_store_i56:
|
||||
; CIVI: ; %bb.0:
|
||||
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CIVI-NEXT: s_mov_b32 m0, -1
|
||||
; CIVI-NEXT: ds_write_b32 v0, v1
|
||||
; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_setpc_b64 s[30:31]
|
||||
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CIVI-NEXT: s_mov_b32 m0, -1
|
||||
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
|
||||
; CIVI-NEXT: ds_write_b32 v0, v1
|
||||
; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: local_store_i56:
|
||||
; GFX9: ; %bb.0:
|
||||
|
@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
|
|||
define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
|
||||
; HAWAII-LABEL: local_store_i55:
|
||||
; HAWAII: ; %bb.0:
|
||||
; HAWAII-NEXT: s_or_b32 s0, s4, 14
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
|
||||
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
|
||||
; HAWAII-NEXT: s_waitcnt vmcnt(0)
|
||||
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
|
||||
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; HAWAII-NEXT: ds_write_b32 v1, v2
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
; HAWAII-NEXT: s_or_b32 s0, s4, 14
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
|
||||
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
|
||||
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
|
||||
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
|
||||
; HAWAII-NEXT: s_waitcnt vmcnt(0)
|
||||
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
|
||||
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; HAWAII-NEXT: ds_write_b32 v1, v2
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
;
|
||||
; FIJI-LABEL: local_store_i55:
|
||||
; FIJI: ; %bb.0:
|
||||
; FIJI-NEXT: s_or_b32 s0, s4, 14
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
|
||||
; FIJI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; FIJI-NEXT: s_waitcnt vmcnt(0)
|
||||
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
|
||||
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
|
||||
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
|
||||
; FIJI-NEXT: ds_write_b32 v1, v3
|
||||
; FIJI-NEXT: s_endpgm
|
||||
; FIJI-NEXT: s_or_b32 s0, s4, 14
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
|
||||
; FIJI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
|
||||
; FIJI-NEXT: s_waitcnt vmcnt(0)
|
||||
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
|
||||
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
|
||||
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; FIJI-NEXT: ds_write_b32 v1, v3
|
||||
; FIJI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: local_store_i55:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
|
||||
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_or_b32_e32 v2, s3, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; GFX9-NEXT: ds_write_b32 v0, v3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
|
||||
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
|
||||
; GFX9-NEXT: ds_write_b32 v0, v3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
store i55 %arg, i55 addrspace(3)* %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
|
|||
define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
|
||||
; HAWAII-LABEL: local_store_i48:
|
||||
; HAWAII: ; %bb.0:
|
||||
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s2
|
||||
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; HAWAII-NEXT: ds_write_b32 v0, v2
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s2
|
||||
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
|
||||
; HAWAII-NEXT: ds_write_b32 v0, v1
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
;
|
||||
; FIJI-LABEL: local_store_i48:
|
||||
; FIJI: ; %bb.0:
|
||||
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; FIJI-NEXT: ds_write_b32 v0, v2
|
||||
; FIJI-NEXT: s_endpgm
|
||||
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; FIJI-NEXT: ds_write_b32 v0, v1
|
||||
; FIJI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: local_store_i48:
|
||||
; GFX9: ; %bb.0:
|
||||
|
@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
|
|||
define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
|
||||
; HAWAII-LABEL: local_store_i65:
|
||||
; HAWAII: ; %bb.0:
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: s_and_b32 s3, s3, 1
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v3, s3
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
|
||||
; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8
|
||||
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4
|
||||
; HAWAII-NEXT: s_mov_b32 m0, -1
|
||||
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
|
||||
; HAWAII-NEXT: s_and_b32 s3, s3, 1
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s3
|
||||
; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
|
||||
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
|
||||
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; HAWAII-NEXT: s_endpgm
|
||||
;
|
||||
; FIJI-LABEL: local_store_i65:
|
||||
; FIJI: ; %bb.0:
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: s_and_b32 s3, s3, 1
|
||||
; FIJI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; FIJI-NEXT: ds_write_b8 v2, v3 offset:8
|
||||
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; FIJI-NEXT: s_endpgm
|
||||
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10
|
||||
; FIJI-NEXT: s_mov_b32 m0, -1
|
||||
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FIJI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; FIJI-NEXT: s_and_b32 s3, s3, 1
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; FIJI-NEXT: ds_write_b8 v2, v0 offset:8
|
||||
; FIJI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; FIJI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; FIJI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: local_store_i65:
|
||||
; GFX9: ; %bb.0:
|
||||
|
@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
|
|||
define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
|
||||
; CIVI-LABEL: local_store_i17:
|
||||
; CIVI: ; %bb.0:
|
||||
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CIVI-NEXT: s_mov_b32 m0, -1
|
||||
; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; CIVI-NEXT: ds_write_b16 v0, v1
|
||||
; CIVI-NEXT: ds_write_b8 v0, v2 offset:2
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_setpc_b64 s[30:31]
|
||||
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CIVI-NEXT: s_mov_b32 m0, -1
|
||||
; CIVI-NEXT: ds_write_b16 v0, v1
|
||||
; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1
|
||||
; CIVI-NEXT: ds_write_b8 v0, v1 offset:2
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: local_store_i17:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b16 v0, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1
|
||||
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
store i17 %arg, i17 addrspace(3)* %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -5,37 +5,37 @@
|
|||
; GCN-LABEL: {{^}}token_factor_inline_limit_test:
|
||||
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}}
|
||||
|
||||
; GCN: v_mov_b32_e32 v31, 7
|
||||
|
|
|
@ -135,12 +135,13 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
|
|||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_add_i32 s7, s7, 34
|
||||
; SI-NEXT: s_or_b32 s7, s7, 4
|
||||
; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
|
||||
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: widen_i17_constant_load:
|
||||
|
@ -157,9 +158,9 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
|
|||
; VI-NEXT: s_or_b32 s0, s0, 4
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: s_bfe_u32 s0, s0, 0x10010
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: flat_store_short v[0:1], v4
|
||||
; VI-NEXT: flat_store_byte v[2:3], v5
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: flat_store_byte v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
%load = load i17, i17 addrspace(4)* %arg, align 4
|
||||
%add = add i17 %load, 34
|
||||
|
|
Loading…
Reference in New Issue