forked from OSchip/llvm-project
[AMDGPU] Tune inlining parameters for AMDGPU target
Summary: Since the target gains no significant advantage from vectorization, the threshold bonus for vector instructions should be optional. The default value of the amdgpu-inline-arg-alloca-cost parameter and the target's InliningThresholdMultiplier value are tuned accordingly. Reviewers: arsenm, rampitec Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, eraman, hiraditya, haicheng, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64642 llvm-svn: 366348
This commit is contained in:
parent
3fce6b5da1
commit
d912a9ba9b
|
@ -263,6 +263,18 @@ public:
|
|||
/// individual classes of instructions would be better.
|
||||
unsigned getInliningThresholdMultiplier() const;
|
||||
|
||||
/// \returns Vector bonus in percent.
|
||||
///
|
||||
/// Vector bonuses: We want to more aggressively inline vector-dense kernels
|
||||
/// and apply this bonus based on the percentage of vector instructions. A
|
||||
/// bonus is applied if the vector instructions exceed 50% and half that amount
|
||||
/// is applied if it exceeds 10%. Note that these bonuses are somewhat
|
||||
/// arbitrary and evolved over time by accident as much as because they are
|
||||
/// principled bonuses.
|
||||
/// FIXME: It would be nice to base the bonus values on something more
|
||||
/// scientific. A target may have no bonus on vector instructions.
|
||||
int getInlinerVectorBonusPercent() const;
|
||||
|
||||
/// Estimate the cost of an intrinsic when lowered.
|
||||
///
|
||||
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
|
||||
|
@ -1128,6 +1140,7 @@ public:
|
|||
virtual int getCallCost(const Function *F,
|
||||
ArrayRef<const Value *> Arguments, const User *U) = 0;
|
||||
virtual unsigned getInliningThresholdMultiplier() = 0;
|
||||
virtual int getInlinerVectorBonusPercent() = 0;
|
||||
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Type *> ParamTys, const User *U) = 0;
|
||||
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
|
||||
|
@ -1351,6 +1364,9 @@ public:
|
|||
/// Forwards the inlining-threshold-multiplier query to the wrapped \c Impl
/// object and returns its result unchanged.
unsigned getInliningThresholdMultiplier() override {
|
||||
return Impl.getInliningThresholdMultiplier();
|
||||
}
|
||||
/// Forwards the inliner vector-bonus query to the wrapped \c Impl object and
/// returns its result unchanged.
int getInlinerVectorBonusPercent() override {
|
||||
return Impl.getInlinerVectorBonusPercent();
|
||||
}
|
||||
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
|
||||
return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
|
||||
|
|
|
@ -140,6 +140,8 @@ public:
|
|||
|
||||
/// Inlining threshold multiplier reported by this implementation: always 1.
unsigned getInliningThresholdMultiplier() {
  constexpr unsigned Multiplier = 1;
  return Multiplier;
}
|
||||
|
||||
/// Vector bonus, in percent, reported by this implementation: always 150.
int getInlinerVectorBonusPercent() {
  constexpr int BonusPercent = 150;
  return BonusPercent;
}
|
||||
|
||||
/// Returns TTI::TCC_Expensive for every call; the instruction \p I is not
/// inspected by this implementation.
unsigned getMemcpyCost(const Instruction *I) {
|
||||
return TTI::TCC_Expensive;
|
||||
}
|
||||
|
|
|
@ -427,6 +427,8 @@ public:
|
|||
|
||||
/// Reports the inlining threshold multiplier for this implementation, which
/// is the constant 1.
unsigned getInliningThresholdMultiplier() {
  const unsigned ThresholdMultiplier = 1;
  return ThresholdMultiplier;
}
|
||||
|
||||
/// Reports the inliner vector bonus, in percent, for this implementation,
/// which is the constant 150.
int getInlinerVectorBonusPercent() {
  const int VectorBonusPercent = 150;
  return VectorBonusPercent;
}
|
||||
|
||||
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
||||
TTI::UnrollingPreferences &UP) {
|
||||
// This unrolling functionality is target independent, but to provide some
|
||||
|
|
|
@ -880,15 +880,6 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
|
|||
// basic block at the given callsite context. This is speculatively applied
|
||||
// and withdrawn if more than one basic block is seen.
|
||||
//
|
||||
// Vector bonuses: We want to more aggressively inline vector-dense kernels
|
||||
// and apply this bonus based on the percentage of vector instructions. A
|
||||
// bonus is applied if the vector instructions exceed 50% and half that amount
|
||||
// is applied if it exceeds 10%. Note that these bonuses are somewhat
|
||||
// arbitrary and evolved over time by accident as much as because they are
|
||||
// principled bonuses.
|
||||
// FIXME: It would be nice to base the bonus values on something more
|
||||
// scientific.
|
||||
//
|
||||
// LastCallToStaticBonus: This large bonus is applied to ensure the inlining
|
||||
// of the last call to a static function as inlining such functions is
|
||||
// guaranteed to reduce code size.
|
||||
|
@ -896,7 +887,7 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
|
|||
// These bonus percentages may be set to 0 based on properties of the caller
|
||||
// and the callsite.
|
||||
int SingleBBBonusPercent = 50;
|
||||
int VectorBonusPercent = 150;
|
||||
int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
|
||||
int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
|
||||
|
||||
// Lambda to set all the above bonus and bonus percentages to 0.
|
||||
|
|
|
@ -176,6 +176,10 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
|
|||
return TTIImpl->getInliningThresholdMultiplier();
|
||||
}
|
||||
|
||||
/// Delegates to the underlying TTIImpl object and returns the vector bonus
/// it reports.
int TargetTransformInfo::getInlinerVectorBonusPercent() const {
|
||||
return TTIImpl->getInlinerVectorBonusPercent();
|
||||
}
|
||||
|
||||
int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
|
||||
ArrayRef<const Value *> Operands) const {
|
||||
return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
|
||||
|
|
|
@ -39,7 +39,7 @@ using namespace llvm;
|
|||
#define DEBUG_TYPE "inline"
|
||||
|
||||
static cl::opt<int>
|
||||
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
|
||||
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
|
||||
cl::desc("Cost of alloca argument"));
|
||||
|
||||
// If the amount of scratch memory to eliminate exceeds our ability to allocate
|
||||
|
|
|
@ -191,7 +191,9 @@ public:
|
|||
bool areInlineCompatible(const Function *Caller,
|
||||
const Function *Callee) const;
|
||||
|
||||
// NOTE(review): this diff hunk also shows a definition of the same method
// returning 7; presumably this line (returning 9) is the pre-change side of
// the hunk — confirm against the actual header.
unsigned getInliningThresholdMultiplier() { return 9; }
|
||||
/// Inlining threshold multiplier reported by this target implementation: 7.
unsigned getInliningThresholdMultiplier() {
  constexpr unsigned Multiplier = 7;
  return Multiplier;
}
|
||||
|
||||
/// Vector bonus, in percent, reported by this target implementation: 0, i.e.
/// no bonus for vector instructions.
int getInlinerVectorBonusPercent() {
  constexpr int BonusPercent = 0;
  return BonusPercent;
}
|
||||
|
||||
int getArithmeticReductionCost(unsigned Opcode,
|
||||
Type *Ty,
|
||||
|
|
|
@ -28,15 +28,8 @@ if.end: ; preds = %if.then, %entry
|
|||
; As rendered here: loads a float through %p1 and, when it compares ordered
; greater-than 1.0, stores 2.0 / value through %p2 before returning.
; NOTE(review): the hunk header says 15 old lines became 8 new lines, so some
; of the lines shown below were presumably removed by the commit — confirm
; against the actual test file.
define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
|
||||
entry:
|
||||
%tmp1 = load float, float addrspace(5)* %p1, align 4
|
||||
%cmp = fcmp ogt float %tmp1, 1.000000e+00
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%div = fdiv float 2.000000e+00, %tmp1
|
||||
store float %div, float addrspace(5)* %p2, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
|
||||
|
||||
; Callee for the vector-bonus inlining test: a single basic block holding a
; chain of twelve dependent <16 x i32> udiv operations, each dividing the
; previous result by %y, followed by a return of the final quotient.
define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
|
||||
entry:
|
||||
%div.1 = udiv <16 x i32> %x, %y
|
||||
%div.2 = udiv <16 x i32> %div.1, %y
|
||||
%div.3 = udiv <16 x i32> %div.2, %y
|
||||
%div.4 = udiv <16 x i32> %div.3, %y
|
||||
%div.5 = udiv <16 x i32> %div.4, %y
|
||||
%div.6 = udiv <16 x i32> %div.5, %y
|
||||
%div.7 = udiv <16 x i32> %div.6, %y
|
||||
%div.8 = udiv <16 x i32> %div.7, %y
|
||||
%div.9 = udiv <16 x i32> %div.8, %y
|
||||
%div.10 = udiv <16 x i32> %div.9, %y
|
||||
%div.11 = udiv <16 x i32> %div.10, %y
|
||||
%div.12 = udiv <16 x i32> %div.11, %y
|
||||
ret <16 x i32> %div.12
|
||||
}
|
||||
|
||||
; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
|
||||
; CHECK-NOT: udiv
|
||||
; CHECK: tail call <16 x i32> @div_vecbonus
|
||||
; CHECK: ret void
|
||||
; Kernel that loads two <16 x i32> vectors from %x and %y, passes them to
; @div_vecbonus, and stores the result back through %x. The CHECK lines above
; expect the call to remain and no udiv to appear in this kernel, i.e. the
; callee is not inlined under the RUN line's --inline-threshold=1.
define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
|
||||
entry:
|
||||
%tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
|
||||
%tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
|
||||
%div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
|
||||
store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue