[Partial Inliner] Compute intrinsic cost through TTI

https://bugs.llvm.org/show_bug.cgi?id=45932

The assertion assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region") is getting triggered in computeBBInlineCost.

Intrinsics like "assume" are considered regular function calls while computing costs.
This patch enables computeBBInlineCost to query TTI for the cost of intrinsic calls.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D87132
This commit is contained in:
Dangeti Tharun kumar 2020-09-16 15:11:24 +01:00 committed by Florian Hahn
parent 855ec517a3
commit 01e2b394ee
2 changed files with 100 additions and 19 deletions

View File

@ -226,10 +226,13 @@ struct PartialInlinerImpl {
// multi-region outlining.
FunctionCloner(Function *F, FunctionOutliningInfo *OI,
OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC);
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC);
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
~FunctionCloner();
// Prepare for function outlining: making sure there is only
@ -266,6 +269,7 @@ struct PartialInlinerImpl {
std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
OptimizationRemarkEmitter &ORE;
function_ref<AssumptionCache *(Function &)> LookupAC;
function_ref<TargetTransformInfo &(Function &)> GetTTI;
};
private:
@ -334,7 +338,7 @@ private:
// Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
// approximate both the size and runtime cost (Note that in the current
// inline cost analysis, there is no clear distinction there either).
static int computeBBInlineCost(BasicBlock *BB);
static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);
std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
std::unique_ptr<FunctionOutliningMultiRegionInfo>
@ -448,9 +452,10 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
// Use the same computeBBInlineCost function to compute the cost savings of
// outlining the candidate region.
TargetTransformInfo *FTTI = &GetTTI(*F);
int OverallFunctionCost = 0;
for (auto &BB : *F)
OverallFunctionCost += computeBBInlineCost(&BB);
OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
#ifndef NDEBUG
if (TracePartialInlining)
@ -509,7 +514,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
continue;
int OutlineRegionCost = 0;
for (auto *BB : DominateVector)
OutlineRegionCost += computeBBInlineCost(BB);
OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
#ifndef NDEBUG
if (TracePartialInlining)
@ -843,7 +848,8 @@ bool PartialInlinerImpl::shouldPartialInline(
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
TargetTransformInfo *TTI) {
int InlineCost = 0;
const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
for (Instruction &I : BB->instructionsWithoutDebug()) {
@ -866,6 +872,21 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
if (I.isLifetimeStartOrEnd())
continue;
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
Intrinsic::ID IID = II->getIntrinsicID();
SmallVector<Type *, 4> Tys;
FastMathFlags FMF;
for (Value *Val : II->args())
Tys.push_back(Val->getType());
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
FMF = FPMO->getFastMathFlags();
IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
continue;
}
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
InlineCost += getCallsiteCost(*CI, DL);
continue;
@ -893,11 +914,13 @@ PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
BasicBlock* OutliningCallBB = FuncBBPair.second;
// Now compute the cost of the call sequence to the outlined function
// 'OutlinedFunction' in BB 'OutliningCallBB':
OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
OutliningFuncCallCost +=
computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
// Now compute the cost of the extracted/outlined function itself:
for (BasicBlock &BB : *OutlinedFunc)
OutlinedFunctionCost += computeBBInlineCost(&BB);
OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
}
assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
"Outlined function cost should be no less than the outlined region");
@ -962,8 +985,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
PartialInlinerImpl::FunctionCloner::FunctionCloner(
Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
ClonedOI = std::make_unique<FunctionOutliningInfo>();
// Clone the function, so that we can hack away on it.
@ -987,8 +1011,9 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner(
PartialInlinerImpl::FunctionCloner::FunctionCloner(
Function *F, FunctionOutliningMultiRegionInfo *OI,
OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
// Clone the function, so that we can hack away on it.
@ -1099,10 +1124,10 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
int Cost = 0;
for (BasicBlock* BB : Region)
Cost += computeBBInlineCost(BB);
Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
return Cost;
};
@ -1196,9 +1221,10 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
// Gather up the blocks that we're going to extract.
std::vector<BasicBlock *> ToExtract;
auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
ToExtract.push_back(ClonedOI->NonReturnBlock);
OutlinedRegionCost +=
PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
ClonedOI->NonReturnBlock, ClonedFuncTTI);
for (BasicBlock &BB : *ClonedFunc)
if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
ToExtract.push_back(&BB);
@ -1206,7 +1232,7 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
// into the outlined function which may make the outlining
// overhead (the difference of the outlined function cost
// and OutliningRegionCost) look larger.
OutlinedRegionCost += computeBBInlineCost(&BB);
OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
}
// Extract the body of the if.
@ -1276,7 +1302,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
computeOutliningColdRegionsInfo(F, ORE);
if (OMRI) {
FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
#ifndef NDEBUG
if (TracePartialInlining) {
@ -1309,7 +1335,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
if (!OI)
return {false, nullptr};
FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
Cloner.NormalizeReturnBlock();
Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

View File

@ -0,0 +1,55 @@
; RUN: opt -partial-inliner -S < %s | FileCheck %s
; Checks that valid costs are computed for intrinsic calls.
; https://bugs.llvm.org/show_bug.cgi?id=45932
@emit_notes = external global i8, align 2
; CHECK: var_reg_delete
; CHECK-NEXT: bb
; CHECK-NEXT: tail call void @delete_variable_part()
; CHECK-NEXT: ret void
; Caller of @delete_variable_part: a single block that tail-calls it and then
; returns. The check lines preceding this definition expect that call to still
; be present, followed directly by the return, in the output.
define void @var_reg_delete() {
bb:
tail call void @delete_variable_part()
ret void
}
; CHECK: delete_variable_part
; CHECK-NEXT: bb
; CHECK-NEXT: %tmp1.i = tail call i32 @find_variable_location_part()
; CHECK-NEXT: %tmp3.i = icmp sgt i32 %tmp1.i, -1
; CHECK-NEXT: br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit
; CHECK: bb4.i
; CHECK-NEXT: %tmp.i.i = load i8, i8* @emit_notes
; CHECK-NEXT: %tmp1.i.i = icmp ne i8 %tmp.i.i, 0
; CHECK-NEXT: tail call void @llvm.assume(i1 %tmp1.i.i)
; CHECK-NEXT: unreachable
; CHECK: delete_slot_part.exit
; CHECK-NEXT: ret void
; Callee of @var_reg_delete. The entry block branches on the result of
; @find_variable_location_part: a result greater than -1 (signed compare) goes
; to %bb4.i, anything else goes to %delete_slot_part.exit, which just returns.
define void @delete_variable_part() {
bb:
%tmp1.i = tail call i32 @find_variable_location_part()
%tmp3.i = icmp sgt i32 %tmp1.i, -1
br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit
bb4.i:
; This block ends in unreachable and contains the @llvm.assume intrinsic call;
; per the note at the top of this test, intrinsic calls are what the cost
; computation is being checked against.
%tmp.i.i = load i8, i8* @emit_notes, align 2
%tmp1.i.i = icmp ne i8 %tmp.i.i, 0
tail call void @llvm.assume(i1 %tmp1.i.i)
unreachable
delete_slot_part.exit:
ret void
}
; CHECK: declare i32 @find_variable_location_part
declare i32 @find_variable_location_part()
; CHECK: declare void @llvm.assume(i1 noundef)
declare void @llvm.assume(i1 noundef)