[AMDGPU] Check atomics aliasing in the clobbering annotation

MemorySSA considers any atomic a def for any operation it dominates,
just like a barrier or fence. That is correct from a memory-state
perspective, but it is not required for the no-clobber metadata, since
we are not using that metadata for reordering. Skip such atomics during
the scan, just like a barrier, if they do not alias with the load.

Differential Revision: https://reviews.llvm.org/D118661
Stanislav Mekhanoshin 2022-01-31 15:10:08 -08:00
parent 4f67a90990
commit 79606ee85c
2 changed files with 213 additions and 1 deletion
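
For readers skimming the diff, here is a condensed sketch of the check this patch introduces. The real logic lives in the isReallyAClobber lambda changed below; the free-function form and the helper name atomicDefMayClobberLoad are illustrative only, not part of the patch.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Sketch: decide whether an atomic MemoryDef instruction can clobber Load.
// An atomicrmw or cmpxchg whose pointer provably does not alias the load's
// pointer is skipped, just like a barrier; anything else stays a
// conservative clobber.
static bool atomicDefMayClobberLoad(Instruction *DefInst, LoadInst *Load,
                                    AliasAnalysis &AA) {
  auto NoAliasWithLoad = [&](auto *I) {
    return I && AA.isNoAlias(I->getPointerOperand(),
                             Load->getPointerOperand());
  };
  if (NoAliasWithLoad(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      NoAliasWithLoad(dyn_cast<AtomicRMWInst>(DefInst)))
    return false; // Provably disjoint atomic: not a real clobber of Load.
  return true;    // Conservative default: treat as a clobber.
}

Only the atomic-versus-load aliasing question is shown here; the surrounding walk over MemorySSA defs and the existing barrier handling are unchanged by the patch.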


@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
@@ -31,6 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
LegacyDivergenceAnalysis *DA;
MemorySSA *MSSA;
AliasAnalysis *AA;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;
@@ -46,6 +48,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LegacyDivergenceAnalysis>();
AU.addRequired<MemorySSAWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.setPreservesAll();
}
@@ -60,6 +63,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -78,7 +82,7 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
SmallSet<MemoryAccess *, 8> Visited;
MemoryLocation Loc(MemoryLocation::get(Load));
const auto isReallyAClobber = [](MemoryDef *Def) -> bool {
const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
Instruction *DefInst = Def->getMemoryInst();
LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n');
@@ -95,6 +99,17 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
}
}
// Ignore atomics not aliasing with the original load, any atomic is a
// universal MemoryDef from MSSA's point of view too, just like a fence.
const auto checkNoAlias = [this, Load](auto I) -> bool {
return I && AA->isNoAlias(I->getPointerOperand(),
Load->getPointerOperand());
};
if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
return false;
return true;
};
@@ -197,6 +212,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<LegacyDivergenceAnalysis>();
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);


@@ -434,5 +434,201 @@ entry:
ret void
}
; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
; GCN: ds_cmpst_b32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
; CHECK-LABEL: @no_alias_atomic_cmpxchg(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%unused = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 %swap seq_cst monotonic
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}no_alias_atomic_rmw:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
; GCN: global_atomic_cmpswap
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
; CHECK-LABEL: @may_alias_atomic_cmpxchg(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg i32 addrspace(1)* [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%unused = cmpxchg i32 addrspace(1)* %out, i32 7, i32 %swap seq_cst monotonic
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}may_alias_atomic_rmw:
; GCN: global_atomic_add
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @may_alias_atomic_rmw(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(1)* [[OUT:%.*]], i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%unused = atomicrmw add i32 addrspace(1)* %out, i32 5 seq_cst
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
; GCN: global_store_dword
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
store i32 1, i32 addrspace(1)* %out, align 4
store i32 2, i32 addrspace(1)* %noalias, align 4
%unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
store i32 2, i32 addrspace(1)* %noalias, align 4
%unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
%ld = load i32, i32 addrspace(1)* %gep, align 4
store i32 %ld, i32 addrspace(1)* %out, align 4
ret void
}
declare void @llvm.amdgcn.s.barrier()
declare void @llvm.amdgcn.wave.barrier()