[AMDGPU] Boost unroll threshold for loops reading local memory

This is less important than increasing the threshold for private
memory, but it still brings performance improvements across a wide
range of tests. Unrolling more aggressively for local memory serves
three purposes: it lets ds operations combine when their offsets
become static, it saves the registers otherwise spent holding dynamic
offsets, and it allows better lds latency hiding.
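
For illustration, consider a loop that walks lds with a dynamic index.
The IR below is a hypothetical sketch, not part of this patch (the
names %lds and %i are made up); ds_read2_b32 is the real DS instruction
the backend can form once the two loads sit at a fixed dword distance
from the same base:

  ; Before unrolling: one dynamically indexed lds load per iteration.
  %gep = getelementptr i32, i32 addrspace(3)* %lds, i32 %i
  %val = load i32, i32 addrspace(3)* %gep

  ; After unrolling by 2: same base, offsets one dword apart, which the
  ; backend can fetch with a single ds_read2_b32 instead of two ds_read
  ; instructions, and without a second offset register.
  %i.1 = add i32 %i, 1
  %gep.0 = getelementptr i32, i32 addrspace(3)* %lds, i32 %i
  %gep.1 = getelementptr i32, i32 addrspace(3)* %lds, i32 %i.1
  %v0 = load i32, i32 addrspace(3)* %gep.0
  %v1 = load i32, i32 addrspace(3)* %gep.1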

Differential Revision: https://reviews.llvm.org/D31412

llvm-svn: 298948
Stanislav Mekhanoshin 2017-03-28 22:13:51 +00:00
parent 99a84b163f
commit baf31ac7c8
2 changed files with 104 additions and 32 deletions

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

@@ -34,6 +34,11 @@ static cl::opt<unsigned> UnrollThresholdPrivate(
   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
   cl::init(2000), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdLocal(
+  "amdgpu-unroll-threshold-local",
+  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+  cl::init(1000), cl::Hidden);
+
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
   UP.Threshold = 300; // Twice the default.
@@ -44,50 +49,87 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
   // Maximum alloca size than can fit registers. Reserve 16 registers.
   const unsigned MaxAlloca = (256 - 16) * 4;
+  unsigned ThresholdPrivate = UnrollThresholdPrivate;
+  unsigned ThresholdLocal = UnrollThresholdLocal;
+  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+  AMDGPUAS ASST = ST->getAMDGPUAS();
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
+    unsigned LocalGEPsSeen = 0;
+
     for (const Instruction &I : *BB) {
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
-      if (!GEP || GEP->getAddressSpace() != ST->getAMDGPUAS().PRIVATE_ADDRESS)
+      if (!GEP)
         continue;
 
-      const Value *Ptr = GEP->getPointerOperand();
-      const AllocaInst *Alloca =
-          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+      unsigned AS = GEP->getAddressSpace();
+      unsigned Threshold = 0;
+      if (AS == ASST.PRIVATE_ADDRESS)
+        Threshold = ThresholdPrivate;
+      else if (AS == ASST.LOCAL_ADDRESS)
+        Threshold = ThresholdLocal;
+      else
+        continue;
 
-      if (Alloca && Alloca->isStaticAlloca()) {
+      if (UP.Threshold >= Threshold)
+        continue;
+
+      if (AS == ASST.PRIVATE_ADDRESS) {
+        const Value *Ptr = GEP->getPointerOperand();
+        const AllocaInst *Alloca =
+            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+        if (!Alloca || !Alloca->isStaticAlloca())
+          continue;
         Type *Ty = Alloca->getAllocatedType();
         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
         if (AllocaSize > MaxAlloca)
          continue;
+      } else if (AS == ASST.LOCAL_ADDRESS) {
+        LocalGEPsSeen++;
+
+        // Inhibit unroll for local memory if we have seen addressing not to
+        // a variable, most likely we will be unable to combine it.
+        // Do not unroll too deep inner loops for local memory to give a chance
+        // to unroll an outer loop for a more important reason.
+        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+             !isa<Argument>(GEP->getPointerOperand())))
+          continue;
+      }
 
-        // Check if GEP depends on a value defined by this loop itself.
-        bool HasLoopDef = false;
-        for (const Value *Op : GEP->operands()) {
-          const Instruction *Inst = dyn_cast<Instruction>(Op);
-          if (!Inst || L->isLoopInvariant(Op))
-            continue;
-          if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
-               return SubLoop->contains(Inst); }))
-            continue;
-          HasLoopDef = true;
-          break;
-        }
-        if (!HasLoopDef)
-          continue;
+      // Check if GEP depends on a value defined by this loop itself.
+      bool HasLoopDef = false;
+      for (const Value *Op : GEP->operands()) {
+        const Instruction *Inst = dyn_cast<Instruction>(Op);
+        if (!Inst || L->isLoopInvariant(Op))
+          continue;
+
+        if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+             return SubLoop->contains(Inst); }))
+          continue;
+        HasLoopDef = true;
+        break;
+      }
+      if (!HasLoopDef)
+        continue;
 
-        // We want to do whatever we can to limit the number of alloca
-        // instructions that make it through to the code generator.  allocas
-        // require us to use indirect addressing, which is slow and prone to
-        // compiler bugs.  If this loop does an address calculation on an
-        // alloca ptr, then we want to use a higher than normal loop unroll
-        // threshold. This will give SROA a better chance to eliminate these
-        // allocas.
-        //
-        // Don't use the maximum allowed value here as it will make some
-        // programs way too big.
-        UP.Threshold = UnrollThresholdPrivate;
-        return;
-      }
+      // We want to do whatever we can to limit the number of alloca
+      // instructions that make it through to the code generator.  allocas
+      // require us to use indirect addressing, which is slow and prone to
+      // compiler bugs.  If this loop does an address calculation on an
+      // alloca ptr, then we want to use a higher than normal loop unroll
+      // threshold. This will give SROA a better chance to eliminate these
+      // allocas.
+      //
+      // We also want to have more unrolling for local memory to let ds
+      // instructions with different offsets combine.
+      //
+      // Don't use the maximum allowed value here as it will make some
+      // programs way too big.
+      UP.Threshold = Threshold;
+
+      DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+                   << *L << " due to " << *GEP << '\n');
+      if (UP.Threshold == MaxBoost)
+        return;
     }
   }
 }
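
The local-memory boost is deliberately conservative. A hypothetical
kernel like the sketch below (not from this patch; @indirect_lds and
%tid are made-up names) is left at the default threshold: the pointer
operand of the lds GEP is another instruction rather than a
GlobalVariable or kernel Argument, so even after unrolling the offsets
would stay relative to a dynamic base and the ds operations would be
unlikely to combine. The same bail-out fires when a block contains more
than one lds GEP or the loop sits at depth greater than 2.

define amdgpu_kernel void @indirect_lds(i32 addrspace(1)* %out, i32 addrspace(3)* %lds, i32 %tid) {
entry:
  ; Dynamic base: %base is an Instruction, not a GlobalVariable/Argument.
  %base = getelementptr i32, i32 addrspace(3)* %lds, i32 %tid
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %inc, %loop ]
  ; The pointer operand of this GEP is %base, so the heuristic above
  ; declines to raise UP.Threshold for this loop.
  %ptr = getelementptr i32, i32 addrspace(3)* %base, i32 %counter
  %val = load i32, i32 addrspace(3)* %ptr
  %out.ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
  store i32 %val, i32 addrspace(1)* %out.ptr
  %inc = add i32 %counter, 1
  %cond = icmp eq i32 %inc, 128
  br i1 %cond, label %exit, label %loop

exit:
  ret void
}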

test/CodeGen/AMDGPU/unroll.ll

@@ -6,10 +6,10 @@
 ; private memory. We want to make sure these kinds of loops are always
 ; unrolled, because private memory is slow.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: @private_memory
 ; CHECK-NOT: alloca
 ; CHECK: store i32 5, i32 addrspace(1)* %out
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
 entry:
   %0 = alloca [32 x i32]
   br label %loop.header
@@ -34,3 +34,33 @@ exit:
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; Check that loop is unrolled for local memory references
+
+; CHECK-LABEL: @local_memory
+; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK-NEXT: store
+; CHECK-NEXT: ret
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+  %val = load i32, i32 addrspace(3)* %ptr_lds
+  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+  store i32 %val, i32 addrspace(1)* %ptr_out
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cond = icmp sge i32 %counter, 128
+  br i1 %cond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
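
To exercise the new thresholds on this test by hand, an invocation
along the following lines should work. The file's real RUN line sits at
the top of unroll.ll and is outside this hunk, so the exact flags shown
here are an assumption:

; RUN: opt -mtriple=amdgcn-- -loop-unroll -sroa -S %s | FileCheck %s
; The flags added above can be tuned on the same command line, e.g.
; -amdgpu-unroll-threshold-local=2000.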