From 7f97ac94f713b6d899a514589a8517d1c1c0081d Mon Sep 17 00:00:00 2001
From: Austin Kerbow
Date: Mon, 18 Apr 2022 21:24:08 -0700
Subject: [PATCH] Revert "[AMDGPU] Omit unnecessary waitcnt before barriers"

This reverts commit 8d0c34fd4fb66ea0d19563154a59658e4b7f35d4.
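
For illustration, a condensed view of the behavior covered by the
back-off-barrier-subtarget-feature.ll test that this revert deletes (the
snippet below is drawn from that test, not from the original commit message):
on a subtarget without automatic waitcnt insertion before barriers, removing
the back-off-barrier feature means an explicit s_waitcnt is again emitted
before s_barrier, e.g. for gfx90a:

  %load = load i32, i32* %in
  call void @llvm.amdgcn.s.barrier()
  store i32 %load, i32* %out

  ; with back-off-barrier (before this revert):
  ;   flat_load_dword v0, v[0:1]
  ;   s_barrier
  ;   s_waitcnt vmcnt(0) lgkmcnt(0)
  ;   flat_store_dword v[2:3], v0
  ;
  ; without it (after this revert):
  ;   flat_load_dword v0, v[0:1]
  ;   s_waitcnt vmcnt(0) lgkmcnt(0)
  ;   s_barrier
  ;   flat_store_dword v[2:3], v0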
---
 llvm/lib/Target/AMDGPU/AMDGPU.td                   | 27 ++----
 llvm/lib/Target/AMDGPU/GCNSubtarget.h              |  7 --
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp        | 10 +-
 .../back-off-barrier-subtarget-feature.ll          | 97 -------------------
 .../AMDGPU/waitcnt-preexisting-vscnt.mir           |  4 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll          |  6 +-
 6 files changed, 17 insertions(+), 134 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 28acd6ef9156..9423e471e6bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -737,12 +737,6 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
   "Hardware automatically inserts waitcnt before barrier"
 >;
 
-def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
-  "BackOffBarrier",
-  "true",
-  "Hardware supports backing off s_barrier if an exception occurs"
->;
-
 def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
   "HasTrigReducedRange",
   "true",
@@ -1031,8 +1025,7 @@ def FeatureISAVersion9_0_A : FeatureSet<
    FeatureMadMacF32Insts,
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
-   FullRate64Ops,
-   FeatureBackOffBarrier]>;
+   FullRate64Ops]>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
   [FeatureGFX9,
@@ -1066,8 +1059,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
    FeatureArchitectedFlatScratch,
-   FullRate64Ops,
-   FeatureBackOffBarrier]>;
+   FullRate64Ops]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
@@ -1102,8 +1094,7 @@ def FeatureISAVersion10_1_0 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK,
-     FeatureBackOffBarrier])>;
+     FeatureSupportsXNACK])>;
 
 def FeatureISAVersion10_1_1 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1125,8 +1116,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK,
-     FeatureBackOffBarrier])>;
+     FeatureSupportsXNACK])>;
 
 def FeatureISAVersion10_1_2 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1148,8 +1138,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK,
-     FeatureBackOffBarrier])>;
+     FeatureSupportsXNACK])>;
 
 def FeatureISAVersion10_1_3 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1167,8 +1156,7 @@ def FeatureISAVersion10_1_3 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK,
-     FeatureBackOffBarrier])>;
+     FeatureSupportsXNACK])>;
 
 def FeatureISAVersion10_3_0 : FeatureSet<
   [FeatureGFX10,
@@ -1185,8 +1173,7 @@ def FeatureISAVersion10_3_0 : FeatureSet<
    FeatureNSAEncoding,
    FeatureNSAMaxSize13,
    FeatureWavefrontSize32,
-   FeatureShaderCyclesRegister,
-   FeatureBackOffBarrier]>;
+   FeatureShaderCyclesRegister]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7b4e445dfb52..f6e1d9ca3c3f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -72,7 +72,6 @@ protected:
   // Dynamically set bits that enable features.
   bool FlatForGlobal = false;
   bool AutoWaitcntBeforeBarrier = false;
-  bool BackOffBarrier = false;
   bool UnalignedScratchAccess = false;
   bool UnalignedAccessMode = false;
   bool HasApertureRegs = false;
@@ -494,12 +493,6 @@ public:
     return AutoWaitcntBeforeBarrier;
   }
 
-  /// \returns true if the target supports backing off of s_barrier instructions
-  /// when an exception is raised.
-  bool supportsBackOffBarrier() const {
-    return BackOffBarrier;
-  }
-
   bool hasUnalignedBufferAccess() const {
     return UnalignedBufferAccess;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0a6f8551b431..d85606b6c19b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1135,12 +1135,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
       }
     }
 
-  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
-  // not, we need to ensure the subtarget is capable of backing off barrier
-  // instructions in case there are any outstanding memory operations that may
-  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
+  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
+  // occurs before the instruction. Doing it here prevents any additional
+  // S_WAITCNTs from being emitted if the instruction was marked as
+  // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
-      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
+      !ST->hasAutoWaitcntBeforeBarrier()) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
deleted file mode 100644
index 337dcfc652bd..000000000000
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ /dev/null
@@ -1,97 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
-
-; Subtargets must wait for outstanding memory instructions before a barrier if
-; they cannot back off of the barrier.
-
-define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
-; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
-; GFX9-NO-BACKOFF:       ; %bb.0:
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    s_barrier
-; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
-; GFX9-BACKOFF:       ; %bb.0:
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-BACKOFF-NEXT:    s_barrier
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
-; GFX10-BACKOFF:       ; %bb.0:
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-BACKOFF-NEXT:    s_barrier
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-  %load = load i32, i32* %in
-  call void @llvm.amdgcn.s.barrier()
-  store i32 %load, i32* %out
-  ret void
-}
-
-define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
-; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
-; GFX9-NO-BACKOFF:       ; %bb.0:
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    s_barrier
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
-; GFX9-BACKOFF:       ; %bb.0:
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    s_barrier
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
-; GFX10-BACKOFF:       ; %bb.0:
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    s_barrier
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    buffer_gl0_inv
-; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-  %load = load i32, i32* %in
-  fence syncscope("workgroup") release
-  call void @llvm.amdgcn.s.barrier()
-  fence syncscope("workgroup") acquire
-  store i32 %load, i32* %out
-  ret void
-}
-
-declare void @llvm.amdgcn.s.barrier()
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index c27363099212..5601d6931705 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -35,7 +35,7 @@ body: |
     ; GFX10: S_WAITCNT 0
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
@@ -112,7 +112,7 @@ body: |
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10: S_WAITCNT 0
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index e78b5355c2d5..f1c5c5b0ee65 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
 
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8: flat_load_dword
@@ -42,7 +42,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
   store i32 0, i32 addrspace(1)* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier()
+  tail call void @llvm.amdgcn.s.barrier() #3
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
@@ -116,7 +116,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
   store i32 0, i32* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier()
+  tail call void @llvm.amdgcn.s.barrier() #3
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32