[AMDGPU] Limit promote alloca to vector with VGPR budget

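Allow at most 1/4 of the available VGPR budget to be used for the vectorization of any given alloca. The budget is taken from the maximum VGPR count the subtarget reports for the function (128 on non-GCN targets) and can be overridden, in bytes, with -amdgpu-promote-alloca-to-vector-limit; the 1/4 factor is applied to the override as well.

Differential Revision: https://reviews.llvm.org/D82990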
2020-07-01 12:08:22 -07:00 · 2020-07-01 12:08:22 -07:00 · 54e2dc7537
parent 5c37b2a5ee
commit 54e2dc7537
2 changed files with 183 additions and 3 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
  cl::desc("Disable promote alloca to LDS"),
  cl::init(false));

+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@ -86,6 +91,7 @@ private:
  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;
@ -129,6 +135,9 @@ public:
 };

 class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
 public:
  static char ID;

@ -186,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!ST.isPromoteAllocaEnabled())
    return false;

+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
  bool SufficientLDS = hasSufficientLocalMem(F);
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();
@ -409,7 +425,8 @@ static bool canVectorizeInst(Instruction *Inst, User *User,
  }
 }

-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
@ -424,6 +441,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

+  // Use up to 1/4 of the available register budget (in bits) for vectorization.
+  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                              : (MaxVGPRs * 32);
+
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
+                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
+
  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
@ -806,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

-  if (tryPromoteAllocaToVector(&I, DL))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
@ -1016,6 +1043,23 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
  if (skipFunction(F) || DisablePromoteAllocaToVector)
    return false;

+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

@ -1042,7 +1086,7 @@ bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  Module *Mod = I.getParent()->getParent()->getParent();
-  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
 }

 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@ -0,0 +1,136 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+
+target datalayout = "A5"
+
+; OPT-LABEL: @alloca_8xi64_max1024(
+; OPT-NOT: alloca
+; OPT: <8 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <8 x i64>
+define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [8 x i64], addrspace(5)
+  %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi64_max1024(
+; OPT: alloca [9 x i64]
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi64_max512(
+; OPT-NOT: alloca
+; OPT: <16 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i64>
+define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [16 x i64], addrspace(5)
+  %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_17xi64_max512(
+; OPT: alloca [17 x i64]
+; OPT-NOT: <17 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <17 x i64>
+define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [17 x i64], addrspace(5)
+  %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max512(
+; OPT: alloca [9 x i128]
+; OPT-NOT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi128_max256(
+; OPT-NOT: alloca
+; OPT: <16 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i128>
+define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [16 x i128], addrspace(5)
+  %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi256_max256(
+; OPT: alloca [9 x i256]
+; OPT-NOT: <9 x i256>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i256>
+define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i256], addrspace(5)
+  %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
+  store i256 0, i256 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i256, i256 addrspace(5)* %tmp1
+  store i256 %tmp2, i256 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }