[AMDGPU] Promote alloca to vector in opt

Promote alloca to vector before SROA and loop unrolling. If we manage
to eliminate the allocas before unrolling, the unroller may choose to unroll less.

Differential Revision: https://reviews.llvm.org/D80386
This commit is contained in:
Stanislav Mekhanoshin 2020-05-20 16:24:06 -07:00
parent be88ba09d5
commit 689e616ed0
5 changed files with 123 additions and 1 deletions

View File

@ -195,6 +195,10 @@ FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
FunctionPass *createAMDGPUPromoteAllocaToVector();
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaToVectorID;
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,

View File

@ -128,14 +128,39 @@ public:
}
};
/// Function pass that attempts the vector form of alloca promotion only.
///
/// The per-function driver is runOnFunction(); each candidate alloca is handed
/// to handleAlloca().
class AMDGPUPromoteAllocaToVector : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  /// Human-readable name reported for this pass.
  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  /// Declares that the pass preserves the CFG, then defers to the base class
  /// for the remaining analysis-usage setup.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }

  /// Runs the promotion over \p F; returns true if the function was changed.
  bool runOnFunction(Function &F) override;

  /// Tries to promote the single alloca \p I; returns true on success.
  bool handleAlloca(AllocaInst &I);
};
} // end anonymous namespace
// Out-of-class definitions of the static ID members of the two pass classes.
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;
// Register both passes. The vector-only variant is registered under
// DEBUG_TYPE with "-to-vector" appended, so the two registrations use
// distinct strings.
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)
// Bind the llvm::-scoped ID references to the static ID members defined above.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
@ -982,6 +1007,43 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
/// Attempts to promote every alloca found in the first basic block of \p F.
///
/// Does nothing (and reports no change) when the function is skipped or when
/// DisablePromoteAllocaToVector is set.
///
/// \returns true if at least one alloca was promoted.
bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
  if (skipFunction(F) || DisablePromoteAllocaToVector)
    return false;

  // Gather the candidates up front so the walk over the block is finished
  // before any of them is processed (presumably because promotion rewrites
  // instructions in that block).
  SmallVector<AllocaInst *, 16> Worklist;
  for (Instruction &Inst : *F.begin())
    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&Inst))
      Worklist.push_back(Alloca);

  bool MadeChange = false;
  for (AllocaInst *Alloca : Worklist)
    MadeChange |= handleAlloca(*Alloca);

  return MadeChange;
}
/// Tries to promote the alloca \p I to a vector.
///
/// Non-static allocas and array allocations are rejected up front; everything
/// else is forwarded to tryPromoteAllocaToVector together with the data layout
/// of the module that contains \p I.
///
/// \returns false for rejected allocas, otherwise the result of
/// tryPromoteAllocaToVector.
bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
  if (!I.isStaticAlloca())
    return false;

  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (I.isArrayAllocation())
    return false;

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  // Walk up instruction -> basic block -> function -> module.
  Function *ContainingFn = I.getParent()->getParent();
  Module *ContainingMod = ContainingFn->getParent();
  const DataLayout &DL = ContainingMod->getDataLayout();

  return tryPromoteAllocaToVector(&I, DL);
}
/// Factory: returns a newly heap-allocated AMDGPUPromoteAlloca pass instance.
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();
}
/// Factory: returns a newly heap-allocated AMDGPUPromoteAllocaToVector pass
/// instance.
FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}

View File

@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
@ -470,7 +471,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,
[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
[EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
@ -478,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// This should run after inlining to have any chance of doing anything,
// and before other cleanup optimizations.
PM.add(createAMDGPULowerKernelAttributesPass());
// Promote alloca to vector before SROA and loop unroll. If we manage
// to eliminate allocas before unroll we may choose to unroll less.
if (EnableOpt)
PM.add(createAMDGPUPromoteAllocaToVector());
});
}

View File

@ -0,0 +1,47 @@
; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
target datalayout = "A5"
; This test contains a simple loop that initializes an array declared in
; private memory. This loop would be fully unrolled if we could not SROA
; the alloca. Check that we successfully eliminate it before the unroll,
; so that we do not need to fully unroll it.
; FUNC-LABEL: @private_memory
; LOOP-NOT: alloca
; LOOP: loop.header:
; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
; FULL-UNROLL: alloca
; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, i32 addrspace(5)*
; FULL-UNROLL-NOT: br
; FUNC: store i32 %{{[^,]+}}, i32 addrspace(1)* %out
; Kernel that fills a 16-element i32 array in addrspace(5) inside a loop and
; then reads one element back and writes it to %out.
define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out, i32 %n) {
entry:
%alloca = alloca [16 x i32], addrspace(5)
br label %loop.header
loop.header:
; %counter starts at 0 and takes the incremented value on each back edge.
%counter = phi i32 [0, %entry], [%inc, %loop.inc]
br label %loop.body
loop.body:
; The store index depends on the runtime value %n; the mask with 15 keeps it
; within the 16 array elements.
%salt = xor i32 %counter, %n
%idx = and i32 %salt, 15
%ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %idx
store i32 %counter, i32 addrspace(5)* %ptr
br label %loop.inc
loop.inc:
; The exit test uses the pre-increment %counter, so the body executes for
; %counter = 0..255, i.e. 256 times.
%inc = add i32 %counter, 1
%cmp = icmp sge i32 %counter, 255
br i1 %cmp, label %exit, label %loop.header
exit:
; Read back the element selected by %n (not masked here) and store it to %out.
%gep = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %n
%load = load i32, i32 addrspace(5)* %gep
store i32 %load, i32 addrspace(1)* %out
ret void
}

View File

@ -101,6 +101,7 @@
; GCN-O1-NEXT: Infer address spaces
; GCN-O1-NEXT: AMDGPU Kernel Attributes
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Promote Alloca to vector
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: SROA
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
@ -401,6 +402,7 @@
; GCN-O2-NEXT: Infer address spaces
; GCN-O2-NEXT: AMDGPU Kernel Attributes
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Promote Alloca to vector
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: SROA
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@ -752,6 +754,7 @@
; GCN-O3-NEXT: Infer address spaces
; GCN-O3-NEXT: AMDGPU Kernel Attributes
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Promote Alloca to vector
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: SROA
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)