[AMDGPU] Limit promote alloca to vector with VGPR budget
Allow only up to 1/4 of available VGPRs for the vectorization of any given alloca.

Differential Revision: https://reviews.llvm.org/D82990
commit 54e2dc7537
parent 5c37b2a5ee
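For a sense of the arithmetic, here is a small standalone sketch of the budget check the patch introduces (the helper name fitsVGPRBudget and the sample figures are mine, not part of the patch): a VGPR holds 32 bits per lane, so MaxVGPRs * 32 is the full per-lane register budget in bits, and an alloca is only vectorized if four times its bit size still fits in that budget, i.e. if the alloca itself would consume at most a quarter of it.

#include <cstdint>
#include <cstdio>

// Illustrative mirror of the patch's check; not the in-tree code.
static bool fitsVGPRBudget(uint64_t AllocaSizeInBits, unsigned MaxVGPRs) {
  const uint64_t BudgetInBits = uint64_t(MaxVGPRs) * 32; // 32 bits per VGPR lane
  return AllocaSizeInBits * 4 <= BudgetInBits;           // use at most 1/4 of the budget
}

int main() {
  // Assuming 64 VGPRs are available, the budget is 2048 bits:
  // an [8 x i64] (512 bits) passes, a [9 x i64] (576 bits) does not,
  // matching the max1024 tests added below.
  std::printf("[8 x i64]: %s\n", fitsVGPRBudget(8 * 64, 64) ? "vectorize" : "keep alloca");
  std::printf("[9 x i64]: %s\n", fitsVGPRBudget(9 * 64, 64) ? "vectorize" : "keep alloca");
  return 0;
}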
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
   cl::desc("Disable promote alloca to LDS"),
   cl::init(false));
 
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -86,6 +91,7 @@ private:
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -129,6 +135,9 @@ public:
 };
 
 class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
 public:
   static char ID;
 
@@ -186,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
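For context (my reading, not text from the patch): ST.getWavesPerEU(F).first is the minimum waves-per-EU requested for the function, and getMaxNumVGPRs returns the largest per-wave VGPR count that still allows that occupancy, so MaxVGPRs is the register budget this kernel could realistically be granted. The non-GCN path has no equivalent query and simply assumes 128 VGPRs.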
@@ -409,7 +425,8 @@ static bool canVectorizeInst(Instruction *Inst, User *User,
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {
 
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
@@ -424,6 +441,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     VectorTy = arrayTypeToVecType(ArrayTy);
   }
 
+  // Use up to 1/4 of available register budget for vectorization.
+  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                              : (MaxVGPRs * 32);
+
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
+                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
+
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
   // FIXME: There is no reason why we can't support larger arrays, we
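A note on the units in this hunk (again my reading): the -amdgpu-promote-alloca-to-vector-limit override is given in bytes, hence the * 8 to convert it to bits, while MaxVGPRs * 32 is the full budget in bits. Multiplying the alloca's bit size by 4 before comparing against that budget is what enforces the 1/4 cap, and the same factor of 4 applies when the byte-size override is in effect.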
@@ -806,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, DL))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
     return true; // Promoted to vector.
 
   if (DisablePromoteAllocaToLDS)
@@ -1016,6 +1043,23 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
   if (skipFunction(F) || DisablePromoteAllocaToVector)
     return false;
 
+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
 
@@ -1042,7 +1086,7 @@ bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
   Module *Mod = I.getParent()->getParent()->getParent();
-  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
@@ -0,0 +1,136 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+
+target datalayout = "A5"
+
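The second RUN line exercises the override: with -amdgpu-promote-alloca-to-vector-limit=32 the limit is 32 * 8 = 256 bits, and since the check still multiplies the alloca size by 4, only allocas of 8 bytes or less would qualify. Every alloca in this file is larger, so the LIMIT32 prefix expects each one to survive as an alloca (assuming I am reading the check correctly).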
+; OPT-LABEL: @alloca_8xi64_max1024(
+; OPT-NOT: alloca
+; OPT: <8 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <8 x i64>
+define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [8 x i64], addrspace(5)
+  %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi64_max1024(
+; OPT: alloca [9 x i64]
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
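The budgets implied by these checks (an inference from the test expectations, not stated in the patch): at "amdgpu-flat-work-group-size"="1,1024" the pass sees roughly 64 VGPRs, so the budget is 64 * 32 = 2048 bits; [8 x i64] is 512 bits (512 * 4 = 2048) and is vectorized, while [9 x i64] is 576 bits (576 * 4 = 2304) and is not. The max512 and max256 functions below follow the same pattern with roughly 128 and 256 VGPRs respectively.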
+
+; OPT-LABEL: @alloca_16xi64_max512(
+; OPT-NOT: alloca
+; OPT: <16 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i64>
+define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [16 x i64], addrspace(5)
+  %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_17xi64_max512(
+; OPT: alloca [17 x i64]
+; OPT-NOT: <17 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <17 x i64>
+define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [17 x i64], addrspace(5)
+  %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max512(
+; OPT: alloca [9 x i128]
+; OPT-NOT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi128_max256(
+; OPT-NOT: alloca
+; OPT: <16 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i128>
+define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [16 x i128], addrspace(5)
+  %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi256_max256(
+; OPT: alloca [9 x i256]
+; OPT-NOT: <9 x i256>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i256>
+define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i256], addrspace(5)
+  %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
+  store i256 0, i256 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i256, i256 addrspace(5)* %tmp1
+  store i256 %tmp2, i256 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }