forked from OSchip/llvm-project
[AMDGPU] Skip invariant loads when avoiding WAR conflicts
No need to handle invariant loads when avoiding WAR conflicts, as there cannot be a vector store to the same memory location.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D101177
This commit is contained in:
parent
cbd93cee9b
commit
68137ef568
|
@ -1521,8 +1521,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
|
||||
if (TII->isSMRD(Inst)) {
|
||||
for (const MachineMemOperand *Memop : Inst.memoperands()) {
|
||||
const Value *Ptr = Memop->getValue();
|
||||
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
|
||||
// No need to handle invariant loads when avoiding WAR conflicts, as
|
||||
// there cannot be a vector store to the same memory location.
|
||||
if (!Memop->isInvariant()) {
|
||||
const Value *Ptr = Memop->getValue();
|
||||
assert(Ptr);
|
||||
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
|
||||
}
|
||||
}
|
||||
if (ST->hasReadVCCZBug()) {
|
||||
// This smem read could complete and clobber vccz at any time.
|
||||
|
|
|
@ -187,49 +187,49 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
|
|||
define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
|
||||
; GFX9-LABEL: slsr1_1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
|
||||
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
|
||||
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
%b = and i32 %b.arg, 16777215
|
||||
%s = and i32 %s.arg, 16777215
|
||||
|
||||
|
|
|
@ -169,7 +169,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
|
|||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x30000
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN: s_swappc_b64 s[30:31], s[4:5]
|
||||
|
||||
; GCN: s_sub_u32 s32, s32, 0x30000
|
||||
; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
|
||||
|
|
|
@ -23,3 +23,18 @@ body: |
|
|||
S_BRANCH %bb.1
|
||||
|
||||
...
|
||||
|
||||
# Check that the waitcnt pass does *not* insert a waitcnt inst after S_BUFFER_LOAD.
|
||||
# WAR hazard does not apply here, because S_BUFFER_LOAD accesses invariant memory.
|
||||
...
|
||||
# CHECK-LABEL: name: waitcnt-no-war-wait
|
||||
# CHECK: S_WAITCNT 0
|
||||
# CHECK-NEXT: S_BUFFER_LOAD_DWORD_IMM
|
||||
# CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact
|
||||
name: waitcnt-no-war-wait
|
||||
body: |
|
||||
bb.0:
|
||||
renamable $sgpr8 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr0_sgpr1_sgpr2_sgpr3, 276, 0 :: (dereferenceable invariant load 4)
|
||||
TBUFFER_STORE_FORMAT_X_OFFEN_exact killed renamable $vgpr0, renamable $vgpr15, renamable $sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr9, 0, 116, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue