diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 8017fb377525..bfac7df11df3 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1521,8 +1521,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (TII->isSMRD(Inst)) { for (const MachineMemOperand *Memop : Inst.memoperands()) { - const Value *Ptr = Memop->getValue(); - SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); + // No need to handle invariant loads when avoiding WAR conflicts, as + // there cannot be a vector store to the same memory location. + if (!Memop->isInvariant()) { + const Value *Ptr = Memop->getValue(); + assert(Ptr); + SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); + } } if (ST->hasReadVCCZBug()) { // This smem read could complete and clobber vccz at any time. diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index cffd7e4e62b1..3226e54b3568 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -187,49 +187,49 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 { define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-LABEL: slsr1_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s33, 4 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_writelane_b32 v43, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 -; GFX9-NEXT: v_writelane_b32 v43, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v43, 2 -; GFX9-NEXT: v_readlane_b32 s5, v43, 3 -; GFX9-NEXT: v_readlane_b32 s35, v43, 1 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 -; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 4 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s33, 4 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v43, 2 +; GFX9-NEXT: v_readlane_b32 s5, v43, 3 +; GFX9-NEXT: v_readlane_b32 s35, v43, 1 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX9-NEXT: v_readlane_b32 s33, v43, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] %b = and i32 %b.arg, 16777215 %s = and i32 %s.arg, 16777215 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 266642a327a7..656ceb373e1f 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -169,7 +169,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_u32 s32, s32, 0x30000 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_swappc_b64 s[30:31], s[4:5] ; GCN: s_sub_u32 s32, s32, 0x30000 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir index 92e73f6db39d..8fc384a75200 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir @@ -23,3 +23,18 @@ body: | S_BRANCH %bb.1 ... + +# Check that the waitcnt pass does *not* insert a waitcnt inst after S_BUFFER_LOAD. +# WAR hazard does not apply here, because S_BUFFER_LOAD accesses invariant memory. +... +# CHECK-LABEL: name: waitcnt-no-war-wait +# CHECK: S_WAITCNT 0 +# CHECK-NEXT: S_BUFFER_LOAD_DWORD_IMM +# CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact +name: waitcnt-no-war-wait +body: | + bb.0: + renamable $sgpr8 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr0_sgpr1_sgpr2_sgpr3, 276, 0 :: (dereferenceable invariant load 4) + TBUFFER_STORE_FORMAT_X_OFFEN_exact killed renamable $vgpr0, renamable $vgpr15, renamable $sgpr4_sgpr5_sgpr6_sgpr7, renamable $sgpr9, 0, 116, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + +...