diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index af1a12dc18de..7574b811bc1c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -263,6 +263,18 @@ public:
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
 
+  /// \returns Vector bonus in percent.
+  ///
+  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
+  /// and apply this bonus based on the percentage of vector instructions. A
+  /// bonus is applied if the vector instructions exceed 50% and half that
+  /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
+  /// arbitrary and evolved over time by accident as much as because they are
+  /// principled bonuses.
+  /// FIXME: It would be nice to base the bonus values on something more
+  /// scientific. A target may have no bonus on vector instructions.
+  int getInlinerVectorBonusPercent() const;
+
   /// Estimate the cost of an intrinsic when lowered.
   ///
   /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
@@ -1128,6 +1140,7 @@ public:
   virtual int getCallCost(const Function *F,
                           ArrayRef<const Value *> Arguments, const User *U) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual int getInlinerVectorBonusPercent() = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                                ArrayRef<Type *> ParamTys, const User *U) = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -1351,6 +1364,9 @@ public:
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
  }
+  int getInlinerVectorBonusPercent() override {
+    return Impl.getInlinerVectorBonusPercent();
+  }
   int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys,
                        const User *U = nullptr) override {
     return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a9383e795fca..b99e1eb9adf0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -140,6 +140,8 @@ public:
 
   unsigned getInliningThresholdMultiplier() { return 1; }
 
+  int getInlinerVectorBonusPercent() { return 150; }
+
   unsigned getMemcpyCost(const Instruction *I) {
     return TTI::TCC_Expensive;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c2d050d9ec85..70bf670fdf0b 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -427,6 +427,8 @@ public:
 
   unsigned getInliningThresholdMultiplier() { return 1; }
 
+  int getInlinerVectorBonusPercent() { return 150; }
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP) {
     // This unrolling functionality is target independent, but to provide some
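The doc comment above describes how the bonus is consumed, and the arithmetic is easy to sketch outside of LLVM. The following is a standalone paraphrase of CallAnalyzer's bonus handling, not LLVM code; effectiveThreshold and the base threshold of 225 (the generic -O2 default) are chosen purely for illustration.

    #include <cstdio>

    // Standalone sketch (not LLVM code): the inliner speculatively adds the
    // full vector bonus up front, then withdraws all of it if at most 10% of
    // the callee's instructions are vector, or half of it if at most 50%.
    static int effectiveThreshold(int BaseThreshold, int VectorBonusPercent,
                                  int NumInstructions, int NumVectorInstructions) {
      int VectorBonus = BaseThreshold * VectorBonusPercent / 100;
      int Threshold = BaseThreshold + VectorBonus; // speculative application
      if (NumVectorInstructions <= NumInstructions / 10)
        Threshold -= VectorBonus;     // <= 10% vector: no bonus
      else if (NumVectorInstructions <= NumInstructions / 2)
        Threshold -= VectorBonus / 2; // <= 50% vector: half the bonus
      return Threshold;
    }

    int main() {
      // Default 150% bonus: a base threshold of 225 grows to 562 for a
      // vector-dense callee (60 of 100 instructions are vector)...
      printf("%d\n", effectiveThreshold(225, 150, 100, 60)); // 562
      // ...stays 225 for a mostly scalar callee...
      printf("%d\n", effectiveThreshold(225, 150, 100, 5));  // 225
      // ...and stays 225 everywhere for a target that returns 0.
      printf("%d\n", effectiveThreshold(225, 0, 100, 60));   // 225
      return 0;
    }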
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 3cb56f8cccf5..0dec146e0465 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -880,15 +880,6 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // basic block at the given callsite context. This is speculatively applied
   // and withdrawn if more than one basic block is seen.
   //
-  // Vector bonuses: We want to more aggressively inline vector-dense kernels
-  // and apply this bonus based on the percentage of vector instructions. A
-  // bonus is applied if the vector instructions exceed 50% and half that amount
-  // is applied if it exceeds 10%. Note that these bonuses are some what
-  // arbitrary and evolved over time by accident as much as because they are
-  // principled bonuses.
-  // FIXME: It would be nice to base the bonus values on something more
-  // scientific.
-  //
   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
   // of the last call to a static function as inlining such functions is
   // guaranteed to reduce code size.
@@ -896,7 +887,7 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // These bonus percentages may be set to 0 based on properties of the caller
   // and the callsite.
   int SingleBBBonusPercent = 50;
-  int VectorBonusPercent = 150;
+  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
 
   // Lambda to set all the above bonus and bonus percentages to 0.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 50c5ae9c19fa..eb04c34453fb 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -176,6 +176,10 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
   return TTIImpl->getInliningThresholdMultiplier();
 }
 
+int TargetTransformInfo::getInlinerVectorBonusPercent() const {
+  return TTIImpl->getInlinerVectorBonusPercent();
+}
+
 int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
                                     ArrayRef<const Value *> Operands) const {
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
index ec0dd6df44ab..f4df20b8f03e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
 #define DEBUG_TYPE "inline"
 
 static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 72882c83c01c..6f1bf5a26f0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -191,7 +191,9 @@ public:
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 9; }
+  unsigned getInliningThresholdMultiplier() { return 7; }
+
+  int getInlinerVectorBonusPercent() { return 0; }
 
   int getArithmeticReductionCost(unsigned Opcode,
                                  Type *Ty,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
index 75c16d006ae4..c2f1836f44af 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -28,15 +28,8 @@ if.end:                                           ; preds = %if.then, %entry
 
 define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
 entry:
   %tmp1 = load float, float addrspace(5)* %p1, align 4
-  %cmp = fcmp ogt float %tmp1, 1.000000e+00
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
   %div = fdiv float 2.000000e+00, %tmp1
   store float %div, float addrspace(5)* %p2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
   ret void
 }
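Taken together with the threshold-multiplier change above (9 down to 7), the AMDGPU override removes a large amount of headroom for vector-dense callees. A back-of-envelope comparison, reusing effectiveThreshold() from the earlier sketch; the 225 base is again an assumption for illustration, not a value taken from the patch:

    // Vector-dense callee (60 of 100 instructions vector), base threshold 225:
    effectiveThreshold(225 * 9, 150, 100, 60); // == 5062, before this patch
    effectiveThreshold(225 * 7,   0, 100, 60); // == 1575, after this patch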
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
new file mode 100644
index 000000000000..cf28d4fe4ab5
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
+
+define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
+entry:
+  %div.1 = udiv <16 x i32> %x, %y
+  %div.2 = udiv <16 x i32> %div.1, %y
+  %div.3 = udiv <16 x i32> %div.2, %y
+  %div.4 = udiv <16 x i32> %div.3, %y
+  %div.5 = udiv <16 x i32> %div.4, %y
+  %div.6 = udiv <16 x i32> %div.5, %y
+  %div.7 = udiv <16 x i32> %div.6, %y
+  %div.8 = udiv <16 x i32> %div.7, %y
+  %div.9 = udiv <16 x i32> %div.8, %y
+  %div.10 = udiv <16 x i32> %div.9, %y
+  %div.11 = udiv <16 x i32> %div.10, %y
+  %div.12 = udiv <16 x i32> %div.11, %y
+  ret <16 x i32> %div.12
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
+; CHECK-NOT: udiv
+; CHECK: tail call <16 x i32> @div_vecbonus
+; CHECK: ret void
+define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
+entry:
+  %tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
+  %tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
+  %div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
+  store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
+  ret void
+}
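The test pins down the new behavior by requiring that the call to @div_vecbonus survive inlining: the callee is almost entirely vector instructions, so the generic 150% bonus would presumably have let it be inlined even at --inline-threshold=1, whereas the AMDGPU override leaves no bonus at all. Plugging the test's numbers into the earlier sketch (and assuming the flag's value is what the target multiplier scales, matching the order of operations in updateThreshold):

    // @div_vecbonus has 13 instructions (12 udivs plus the ret), 12 of them
    // vector, so it clears the "more than 50% vector" bar for the full bonus.
    effectiveThreshold(1 * 9, 150, 13, 12); // == 22, old multiplier and bonus
    effectiveThreshold(1 * 7,   0, 13, 12); // ==  7, with this patch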