[AMDGPU] Boost unroll threshold for loops reading local memory

This is less important than increasing the threshold for private
memory, but it still brings performance improvements across a wide
range of tests. Unrolling more aggressively for local memory serves
three purposes: it lets ds operations combine when their offsets
become static, it saves the registers otherwise spent holding dynamic
offsets, and it allows better lds latency hiding.
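
For illustration, consider a loop that walks lds with a dynamic index.
The IR below is a hypothetical sketch, not part of this patch (the
names %lds and %i are made up); ds_read2_b32 is the real DS instruction
the backend can form once the two loads sit at a fixed dword distance
from the same base:

  ; Before unrolling: one dynamically indexed lds load per iteration.
  %gep = getelementptr i32, i32 addrspace(3)* %lds, i32 %i
  %val = load i32, i32 addrspace(3)* %gep

  ; After unrolling by 2: same base, offsets one dword apart, which the
  ; backend can fetch with a single ds_read2_b32 instead of two ds_read
  ; instructions, and without a second offset register.
  %i.1 = add i32 %i, 1
  %gep.0 = getelementptr i32, i32 addrspace(3)* %lds, i32 %i
  %gep.1 = getelementptr i32, i32 addrspace(3)* %lds, i32 %i.1
  %v0 = load i32, i32 addrspace(3)* %gep.0
  %v1 = load i32, i32 addrspace(3)* %gep.1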

Differential Revision: https://reviews.llvm.org/D31412

llvm-svn: 298948
Stanislav Mekhanoshin 2017-03-28 22:13:51 +00:00
parent 99a84b163f
commit baf31ac7c8
2 changed files with 104 additions and 32 deletions

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

@@ -34,6 +34,11 @@ static cl::opt<unsigned> UnrollThresholdPrivate(
   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
   cl::init(2000), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdLocal(
+  "amdgpu-unroll-threshold-local",
+  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+  cl::init(1000), cl::Hidden);
+
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
   UP.Threshold = 300; // Twice the default.
@@ -44,50 +49,87 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
   // Maximum alloca size than can fit registers. Reserve 16 registers.
   const unsigned MaxAlloca = (256 - 16) * 4;
+  unsigned ThresholdPrivate = UnrollThresholdPrivate;
+  unsigned ThresholdLocal = UnrollThresholdLocal;
+  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+  AMDGPUAS ASST = ST->getAMDGPUAS();
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
+    unsigned LocalGEPsSeen = 0;
+
     for (const Instruction &I : *BB) {
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
-      if (!GEP || GEP->getAddressSpace() != ST->getAMDGPUAS().PRIVATE_ADDRESS)
+      if (!GEP)
         continue;
 
-      const Value *Ptr = GEP->getPointerOperand();
-      const AllocaInst *Alloca =
-          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+      unsigned AS = GEP->getAddressSpace();
+      unsigned Threshold = 0;
+      if (AS == ASST.PRIVATE_ADDRESS)
+        Threshold = ThresholdPrivate;
+      else if (AS == ASST.LOCAL_ADDRESS)
+        Threshold = ThresholdLocal;
+      else
+        continue;
 
-      if (Alloca && Alloca->isStaticAlloca()) {
+      if (UP.Threshold >= Threshold)
+        continue;
+
+      if (AS == ASST.PRIVATE_ADDRESS) {
+        const Value *Ptr = GEP->getPointerOperand();
+        const AllocaInst *Alloca =
+            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+        if (!Alloca || !Alloca->isStaticAlloca())
+          continue;
         Type *Ty = Alloca->getAllocatedType();
         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
         if (AllocaSize > MaxAlloca)
          continue;
+      } else if (AS == ASST.LOCAL_ADDRESS) {
+        LocalGEPsSeen++;
+
+        // Inhibit unroll for local memory if we have seen addressing not to
+        // a variable, most likely we will be unable to combine it.
+        // Do not unroll too deep inner loops for local memory to give a chance
+        // to unroll an outer loop for a more important reason.
+        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+             !isa<Argument>(GEP->getPointerOperand())))
+          continue;
+      }
 
-        // Check if GEP depends on a value defined by this loop itself.
-        bool HasLoopDef = false;
-        for (const Value *Op : GEP->operands()) {
-          const Instruction *Inst = dyn_cast<Instruction>(Op);
-          if (!Inst || L->isLoopInvariant(Op))
-            continue;
-          if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
-               return SubLoop->contains(Inst); }))
-            continue;
-          HasLoopDef = true;
-          break;
-        }
-        if (!HasLoopDef)
-          continue;
+      // Check if GEP depends on a value defined by this loop itself.
+      bool HasLoopDef = false;
+      for (const Value *Op : GEP->operands()) {
+        const Instruction *Inst = dyn_cast<Instruction>(Op);
+        if (!Inst || L->isLoopInvariant(Op))
+          continue;
+
+        if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+             return SubLoop->contains(Inst); }))
+          continue;
+        HasLoopDef = true;
+        break;
+      }
+      if (!HasLoopDef)
+        continue;
 
-        // We want to do whatever we can to limit the number of alloca
-        // instructions that make it through to the code generator.  allocas
-        // require us to use indirect addressing, which is slow and prone to
-        // compiler bugs.  If this loop does an address calculation on an
-        // alloca ptr, then we want to use a higher than normal loop unroll
-        // threshold. This will give SROA a better chance to eliminate these
-        // allocas.
-        //
-        // Don't use the maximum allowed value here as it will make some
-        // programs way too big.
-        UP.Threshold = UnrollThresholdPrivate;
-        return;
-      }
+      // We want to do whatever we can to limit the number of alloca
+      // instructions that make it through to the code generator.  allocas
+      // require us to use indirect addressing, which is slow and prone to
+      // compiler bugs.  If this loop does an address calculation on an
+      // alloca ptr, then we want to use a higher than normal loop unroll
+      // threshold. This will give SROA a better chance to eliminate these
+      // allocas.
+      //
+      // We also want to have more unrolling for local memory to let ds
+      // instructions with different offsets combine.
+      //
+      // Don't use the maximum allowed value here as it will make some
+      // programs way too big.
+      UP.Threshold = Threshold;
+
+      DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+                   << *L << " due to " << *GEP << '\n');
+      if (UP.Threshold == MaxBoost)
+        return;
     }
   }
 }
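
The local-memory boost is deliberately conservative. A hypothetical
kernel like the sketch below (not from this patch; @indirect_lds and
%tid are made-up names) is left at the default threshold: the pointer
operand of the lds GEP is another instruction rather than a
GlobalVariable or kernel Argument, so even after unrolling the offsets
would stay relative to a dynamic base and the ds operations would be
unlikely to combine. The same bail-out fires when a block contains more
than one lds GEP or the loop sits at depth greater than 2.

define amdgpu_kernel void @indirect_lds(i32 addrspace(1)* %out, i32 addrspace(3)* %lds, i32 %tid) {
entry:
  ; Dynamic base: %base is an Instruction, not a GlobalVariable/Argument.
  %base = getelementptr i32, i32 addrspace(3)* %lds, i32 %tid
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %inc, %loop ]
  ; The pointer operand of this GEP is %base, so the heuristic above
  ; declines to raise UP.Threshold for this loop.
  %ptr = getelementptr i32, i32 addrspace(3)* %base, i32 %counter
  %val = load i32, i32 addrspace(3)* %ptr
  %out.ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
  store i32 %val, i32 addrspace(1)* %out.ptr
  %inc = add i32 %counter, 1
  %cond = icmp eq i32 %inc, 128
  br i1 %cond, label %exit, label %loop

exit:
  ret void
}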

test/CodeGen/AMDGPU/unroll.ll

@@ -6,10 +6,10 @@
 ; private memory. We want to make sure these kinds of loops are always
 ; unrolled, because private memory is slow.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: @private_memory
 ; CHECK-NOT: alloca
 ; CHECK: store i32 5, i32 addrspace(1)* %out
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
 entry:
   %0 = alloca [32 x i32]
   br label %loop.header
@@ -34,3 +34,33 @@ exit:
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; Check that loop is unrolled for local memory references
+
+; CHECK-LABEL: @local_memory
+; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK-NEXT: store
+; CHECK-NEXT: ret
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+  %val = load i32, i32 addrspace(3)* %ptr_lds
+  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+  store i32 %val, i32 addrspace(1)* %ptr_out
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cond = icmp sge i32 %counter, 128
+  br i1 %cond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
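
To exercise the new thresholds on this test by hand, an invocation
along the following lines should work. The file's real RUN line sits at
the top of unroll.ll and is outside this hunk, so the exact flags shown
here are an assumption:

; RUN: opt -mtriple=amdgcn-- -loop-unroll -sroa -S %s | FileCheck %s
; The flags added above can be tuned on the same command line, e.g.
; -amdgpu-unroll-threshold-local=2000.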