forked from OSchip/llvm-project
[AMDGPU] Constrain the AMDGPU inliner by the maximum number of basic blocks in the caller function (compile-time performance)
Differential revision: https://reviews.llvm.org/D62917 llvm-svn: 362789
This commit is contained in:
parent
32742d8f36
commit
cb8de55f47
|
@ -49,6 +49,12 @@ static cl::opt<unsigned>
|
|||
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
|
||||
cl::desc("Maximum alloca size to use for inline cost"));
|
||||
|
||||
// Inliner constraint to keep compilation time reasonable.
// Hidden option "amdgpu-inline-max-bb" (default 300): upper bound on the
// estimated number of basic blocks in the caller after inlining a callee.
// The inline-cost check only applies the bound when the value is non-zero,
// so 0 disables the limit.
|
||||
static cl::opt<size_t>
|
||||
MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
|
||||
cl::desc("Maximum BB number allowed in a function after inlining"
|
||||
" (compile time constraint)"));
|
||||
|
||||
namespace {
|
||||
|
||||
class AMDGPUInliner : public LegacyInlinerBase {
|
||||
|
@ -208,7 +214,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
|
|||
return ACT->getAssumptionCache(F);
|
||||
};
|
||||
|
||||
return llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
|
||||
auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
|
||||
LocalParams, TTI, GetAssumptionCache, None, PSI,
|
||||
RemarksEnabled ? &ORE : nullptr);
|
||||
|
||||
if (IC && !IC.isAlways()) {
|
||||
// A single-BB callee does not increase the caller's total BB count, thus subtract 1
|
||||
size_t Size = Caller->size() + Callee->size() - 1;
|
||||
if (MaxBB && Size > MaxBB)
|
||||
return llvm::InlineCost::getNever("max number of bb exceeded");
|
||||
}
|
||||
return IC;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
|
||||
; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
|
||||
|
||||
; Callee with three basic blocks (entry, mulx, ret_res). Combined with the
; single-block @caller, the inliner's estimate is 1 + 3 - 1 = 3 blocks:
; above a limit of 2 (call must stay) and not above a limit of 3 (call is
; inlined, so the multiplies show up in @caller).
define i32 @callee(i32 %x) {
|
||||
entry:
|
||||
%cc = icmp eq i32 %x, 1
|
||||
br i1 %cc, label %ret_res, label %mulx
|
||||
|
||||
mulx:
|
||||
%mul1 = mul i32 %x, %x
|
||||
%mul2 = mul i32 %mul1, %x
|
||||
%mul3 = mul i32 %mul1, %mul2
|
||||
%mul4 = mul i32 %mul3, %mul2
|
||||
%mul5 = mul i32 %mul4, %mul3
|
||||
br label %ret_res
|
||||
|
||||
ret_res:
|
||||
%r = phi i32 [ %mul5, %mulx ], [ %x, %entry ]
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; INL-LABEL: @caller
|
||||
; NOINL-LABEL: @caller
|
||||
; INL: mul i32
|
||||
; INL-NOT: call i32
|
||||
; NOINL-NOT: mul i32
|
||||
; NOINL: call i32
|
||||
|
||||
; Single-block kernel that calls @callee once and writes the result with a
; volatile store. The checks above look inside this function for either the
; surviving call or the inlined multiplies.
define amdgpu_kernel void @caller(i32 %x) {
|
||||
%res = call i32 @callee(i32 %x)
|
||||
store volatile i32 %res, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue