[AMDGPU] Skip invariant loads when avoiding WAR conflicts

Invariant loads do not need to be handled when avoiding WAR conflicts,
because there cannot be a vector store to the same memory location.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D101177
This commit is contained in:
Piotr Sobczak 2021-05-12 09:23:59 +02:00
parent cbd93cee9b
commit 68137ef568
4 changed files with 66 additions and 46 deletions

View File

@ -1521,8 +1521,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (TII->isSMRD(Inst)) { if (TII->isSMRD(Inst)) {
for (const MachineMemOperand *Memop : Inst.memoperands()) { for (const MachineMemOperand *Memop : Inst.memoperands()) {
const Value *Ptr = Memop->getValue(); // No need to handle invariant loads when avoiding WAR conflicts, as
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); // there cannot be a vector store to the same memory location.
if (!Memop->isInvariant()) {
const Value *Ptr = Memop->getValue();
assert(Ptr);
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
}
} }
if (ST->hasReadVCCZBug()) { if (ST->hasReadVCCZBug()) {
// This smem read could complete and clobber vccz at any time. // This smem read could complete and clobber vccz at any time.

View File

@ -187,49 +187,49 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-LABEL: slsr1_1: ; GFX9-LABEL: slsr1_1:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v43, s33, 4 ; GFX9-NEXT: v_writelane_b32 v43, s33, 4
; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_add_u32 s32, s32, 0x800 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800
; GFX9-NEXT: v_writelane_b32 v43, s34, 0 ; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX9-NEXT: v_writelane_b32 v43, s35, 1 ; GFX9-NEXT: v_writelane_b32 v43, s35, 1
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v40, v1 ; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_writelane_b32 v43, s30, 2
; GFX9-NEXT: v_writelane_b32 v43, s30, 2 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 ; GFX9-NEXT: v_writelane_b32 v43, s31, 3
; GFX9-NEXT: v_writelane_b32 v43, s31, 3 ; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 ; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: v_mov_b32_e32 v0, v40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v43, 2 ; GFX9-NEXT: v_readlane_b32 s4, v43, 2
; GFX9-NEXT: v_readlane_b32 s5, v43, 3 ; GFX9-NEXT: v_readlane_b32 s5, v43, 3
; GFX9-NEXT: v_readlane_b32 s35, v43, 1 ; GFX9-NEXT: v_readlane_b32 s35, v43, 1
; GFX9-NEXT: v_readlane_b32 s34, v43, 0 ; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
; GFX9-NEXT: v_readlane_b32 s33, v43, 4 ; GFX9-NEXT: v_readlane_b32 s33, v43, 4
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5] ; GFX9-NEXT: s_setpc_b64 s[4:5]
%b = and i32 %b.arg, 16777215 %b = and i32 %b.arg, 16777215
%s = and i32 %s.arg, 16777215 %s = and i32 %s.arg, 16777215

View File

@ -169,7 +169,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-DAG: s_add_u32 s32, s32, 0x30000 ; GCN-DAG: s_add_u32 s32, s32, 0x30000
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: s_sub_u32 s32, s32, 0x30000 ; GCN: s_sub_u32 s32, s32, 0x30000
; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2

View File

@ -23,3 +23,18 @@ body: |
S_BRANCH %bb.1 S_BRANCH %bb.1
... ...
# Check that the waitcnt pass does *not* insert a waitcnt inst between the
# S_BUFFER_LOAD and the TBUFFER_STORE that follows it. The WAR hazard does not
# apply here, because the S_BUFFER_LOAD is an invariant load.
...
# CHECK-LABEL: name: waitcnt-no-war-wait
# CHECK: S_WAITCNT 0
# CHECK-NEXT: S_BUFFER_LOAD_DWORD_IMM
# CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact
name: waitcnt-no-war-wait
body: |
bb.0:
renamable $sgpr8 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr0_sgpr1_sgpr2_sgpr3, 276, 0 :: (dereferenceable invariant load 4)
TBUFFER_STORE_FORMAT_X_OFFEN_exact killed renamable $vgpr0, renamable $vgpr15, renamable $sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr9, 0, 116, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
...