[AMDGPU] Limit promote alloca to vector with VGPR budget

Allow only up to 1/4 of available VGPRs for the vectorization
of any given alloca.

Differential Revision: https://reviews.llvm.org/D82990
This commit is contained in:
Stanislav Mekhanoshin 2020-07-01 12:08:22 -07:00
parent 5c37b2a5ee
commit 54e2dc7537
2 changed files with 183 additions and 3 deletions

View File

@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
cl::desc("Disable promote alloca to LDS"),
cl::init(false));
// Optional override for the alloca-to-vector size budget. The default of 0
// means "no override": tryPromoteAllocaToVector then derives the budget from
// MaxVGPRs (MaxVGPRs * 32 bits). A non-zero value is scaled by 8 and compared
// against four times the alloca's size in bits, so an alloca is rejected when
// its size exceeds a quarter of the resulting budget.
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
"amdgpu-promote-alloca-to-vector-limit",
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@ -86,6 +91,7 @@ private:
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
unsigned MaxVGPRs;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@ -129,6 +135,9 @@ public:
};
class AMDGPUPromoteAllocaToVector : public FunctionPass {
private:
unsigned MaxVGPRs;
public:
static char ID;
@ -186,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
if (IsAMDGCN) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
} else {
MaxVGPRs = 128;
}
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@ -409,7 +425,8 @@ static bool canVectorizeInst(Instruction *Inst, User *User,
}
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
unsigned MaxVGPRs) {
if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
@ -424,6 +441,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
VectorTy = arrayTypeToVecType(ArrayTy);
}
// Use up to 1/4 of available register budget for vectorization.
unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32);
if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
<< MaxVGPRs << " registers available\n");
return false;
}
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
@ -806,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I, DL))
if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.
if (DisablePromoteAllocaToLDS)
@ -1016,6 +1043,23 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
if (skipFunction(F) || DisablePromoteAllocaToVector)
return false;
const TargetMachine *TM;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
TM = &TPC->getTM<TargetMachine>();
else
return false;
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
} else {
MaxVGPRs = 128;
}
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@ -1042,7 +1086,7 @@ bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
Module *Mod = I.getParent()->getParent()->getParent();
return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {

View File

@ -0,0 +1,136 @@
; Exercises the size limit applied when promoting allocas to vectors. The first
; invocation uses the default budget; the second passes an explicit
; -amdgpu-promote-alloca-to-vector-limit=32, under which every alloca in this
; file is expected to stay in place.
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
; "A5" selects address space 5 for allocas, matching the addrspace(5) allocas
; used by every kernel below.
target datalayout = "A5"
; [8 x i64] alloca (512 bits) in a kernel using attribute group #0
; (flat work-group size "1,1024"). Element 0 is zeroed, then an element picked
; by the runtime %index is loaded and written to %out.
; Default run: the alloca must disappear and an <8 x i64> vector must appear.
; limit=32 run: the alloca must remain and no <8 x i64> may appear.
; NOTE(review): together with the 9-element sibling below this brackets the
; default budget; 512 * 4 == 64 * 32 suggests 64 VGPRs here -- confirm.
; OPT-LABEL: @alloca_8xi64_max1024(
; OPT-NOT: alloca
; OPT: <8 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <8 x i64>
define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
%tmp = alloca [8 x i64], addrspace(5)
%x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
store i64 0, i64 addrspace(5)* %x
%tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i64, i64 addrspace(5)* %tmp1
store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
; [9 x i64] alloca (576 bits) in a kernel using attribute group #0
; (flat work-group size "1,1024"); same access pattern as the 8-element case:
; zero element 0, load the element at the runtime %index, store it to %out.
; Both runs expect the alloca to remain and no <9 x i64> vector to appear,
; i.e. one element past the 8 x i64 case is already too large by default.
; OPT-LABEL: @alloca_9xi64_max1024(
; OPT: alloca [9 x i64]
; OPT-NOT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
%tmp = alloca [9 x i64], addrspace(5)
%x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
store i64 0, i64 addrspace(5)* %x
%tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i64, i64 addrspace(5)* %tmp1
store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
; [16 x i64] alloca (1024 bits) in a kernel using attribute group #1
; (flat work-group size "1,512"). Element 0 is zeroed, then the element at the
; runtime %index is loaded and written to %out.
; Default run: the alloca must disappear and a <16 x i64> vector must appear.
; limit=32 run: the alloca must remain and no <16 x i64> may appear.
; NOTE(review): 1024 * 4 == 128 * 32 suggests a 128-VGPR budget for this
; attribute group -- confirm.
; OPT-LABEL: @alloca_16xi64_max512(
; OPT-NOT: alloca
; OPT: <16 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i64>
define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
entry:
%tmp = alloca [16 x i64], addrspace(5)
%x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
store i64 0, i64 addrspace(5)* %x
%tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i64, i64 addrspace(5)* %tmp1
store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
; [17 x i64] alloca (1088 bits) in a kernel using attribute group #1
; (flat work-group size "1,512"); same access pattern as the 16-element case.
; Both runs expect the alloca to remain and no <17 x i64> vector to appear,
; i.e. one element past the 16 x i64 case is too large by default.
; OPT-LABEL: @alloca_17xi64_max512(
; OPT: alloca [17 x i64]
; OPT-NOT: <17 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <17 x i64>
define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
entry:
%tmp = alloca [17 x i64], addrspace(5)
%x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
store i64 0, i64 addrspace(5)* %x
%tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i64, i64 addrspace(5)* %tmp1
store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
; [9 x i128] alloca (1152 bits) in a kernel using attribute group #1
; (flat work-group size "1,512"). Element 0 is zeroed, then the element at the
; runtime %index is loaded and written to %out.
; Both runs expect the alloca to remain and no <9 x i128> vector to appear.
; The same array type is expected to be promoted under attribute group #2 in
; the next kernel, so the pair shows the limit depends on the attribute.
; OPT-LABEL: @alloca_9xi128_max512(
; OPT: alloca [9 x i128]
; OPT-NOT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
entry:
%tmp = alloca [9 x i128], addrspace(5)
%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
store i128 0, i128 addrspace(5)* %x
%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i128, i128 addrspace(5)* %tmp1
store i128 %tmp2, i128 addrspace(1)* %out
ret void
}
; [9 x i128] alloca (1152 bits) in a kernel using attribute group #2
; (flat work-group size "1,256"). Element 0 is zeroed, then the element at the
; runtime %index is loaded and written to %out.
; Default run: the alloca must disappear and a <9 x i128> vector must appear,
; unlike the identical array under attribute group #1 above.
; limit=32 run: the alloca must remain and no <9 x i128> may appear.
; OPT-LABEL: @alloca_9xi128_max256(
; OPT-NOT: alloca
; OPT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i128], addrspace(5)
%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
store i128 0, i128 addrspace(5)* %x
%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i128, i128 addrspace(5)* %tmp1
store i128 %tmp2, i128 addrspace(1)* %out
ret void
}
; [16 x i128] alloca (2048 bits) in a kernel using attribute group #2
; (flat work-group size "1,256"). Element 0 is zeroed, then the element at the
; runtime %index is loaded and written to %out.
; Default run: the alloca must disappear and a <16 x i128> vector must appear.
; limit=32 run: the alloca must remain and no <16 x i128> may appear.
; NOTE(review): 2048 * 4 == 256 * 32 suggests a 256-VGPR budget for this
; attribute group, making this the largest accepted size -- confirm.
; OPT-LABEL: @alloca_16xi128_max256(
; OPT-NOT: alloca
; OPT: <16 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i128>
define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
entry:
%tmp = alloca [16 x i128], addrspace(5)
%x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
store i128 0, i128 addrspace(5)* %x
%tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i128, i128 addrspace(5)* %tmp1
store i128 %tmp2, i128 addrspace(1)* %out
ret void
}
; [9 x i256] alloca (2304 bits) in a kernel using attribute group #2
; (flat work-group size "1,256"). Element 0 is zeroed, then the element at the
; runtime %index is loaded and written to %out.
; Both runs expect the alloca to remain and no <9 x i256> vector to appear:
; this is larger than the 16 x i128 case that is still promoted by default.
; OPT-LABEL: @alloca_9xi256_max256(
; OPT: alloca [9 x i256]
; OPT-NOT: <9 x i256>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i256>
define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i256], addrspace(5)
%x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
store i256 0, i256 addrspace(5)* %x
%tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
%tmp2 = load i256, i256 addrspace(5)* %tmp1
store i256 %tmp2, i256 addrspace(1)* %out
ret void
}
; Attribute groups referenced by the kernels above. Each sets a different
; "amdgpu-flat-work-group-size" value: #0 is used by the *_max1024 kernels,
; #1 by the *_max512 kernels and #2 by the *_max256 kernels.
; NOTE(review): the value is presumably a "min,max" work-group size pair that
; determines the VGPR budget seen by the pass -- confirm against AMDGPU docs.
attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }