forked from OSchip/llvm-project
[Partial Inliner] Compute intrinsic cost through TTI
https://bugs.llvm.org/show_bug.cgi?id=45932: assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region") was getting triggered in computeBBInlineCost. Intrinsics like "assume" are considered regular function calls while computing costs. This patch enables computeBBInlineCost to query TTI for intrinsic call cost. Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D87132
This commit is contained in:
parent
855ec517a3
commit
01e2b394ee
|
@ -226,10 +226,13 @@ struct PartialInlinerImpl {
|
|||
// multi-region outlining.
|
||||
FunctionCloner(Function *F, FunctionOutliningInfo *OI,
|
||||
OptimizationRemarkEmitter &ORE,
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC);
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI);
|
||||
FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
|
||||
OptimizationRemarkEmitter &ORE,
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC);
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI);
|
||||
|
||||
~FunctionCloner();
|
||||
|
||||
// Prepare for function outlining: making sure there is only
|
||||
|
@ -266,6 +269,7 @@ struct PartialInlinerImpl {
|
|||
std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
|
||||
OptimizationRemarkEmitter &ORE;
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC;
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -334,7 +338,7 @@ private:
|
|||
// Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
|
||||
// approximate both the size and runtime cost (Note that in the current
|
||||
// inline cost analysis, there is no clear distinction there either).
|
||||
static int computeBBInlineCost(BasicBlock *BB);
|
||||
static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);
|
||||
|
||||
std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
|
||||
std::unique_ptr<FunctionOutliningMultiRegionInfo>
|
||||
|
@ -448,9 +452,10 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
|
|||
|
||||
// Use the same computeBBInlineCost function to compute the cost savings of
|
||||
// outlining the candidate region.
|
||||
TargetTransformInfo *FTTI = &GetTTI(*F);
|
||||
int OverallFunctionCost = 0;
|
||||
for (auto &BB : *F)
|
||||
OverallFunctionCost += computeBBInlineCost(&BB);
|
||||
OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (TracePartialInlining)
|
||||
|
@ -509,7 +514,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
|
|||
continue;
|
||||
int OutlineRegionCost = 0;
|
||||
for (auto *BB : DominateVector)
|
||||
OutlineRegionCost += computeBBInlineCost(BB);
|
||||
OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (TracePartialInlining)
|
||||
|
@ -843,7 +848,8 @@ bool PartialInlinerImpl::shouldPartialInline(
|
|||
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
|
||||
// For now use a simplified version. The returned 'InlineCost' will be used
|
||||
// to estimate the size cost as well as runtime cost of the BB.
|
||||
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
|
||||
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
|
||||
TargetTransformInfo *TTI) {
|
||||
int InlineCost = 0;
|
||||
const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
|
||||
for (Instruction &I : BB->instructionsWithoutDebug()) {
|
||||
|
@ -866,6 +872,21 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
|
|||
if (I.isLifetimeStartOrEnd())
|
||||
continue;
|
||||
|
||||
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
|
||||
Intrinsic::ID IID = II->getIntrinsicID();
|
||||
SmallVector<Type *, 4> Tys;
|
||||
FastMathFlags FMF;
|
||||
for (Value *Val : II->args())
|
||||
Tys.push_back(Val->getType());
|
||||
|
||||
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
|
||||
IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
|
||||
InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
|
||||
InlineCost += getCallsiteCost(*CI, DL);
|
||||
continue;
|
||||
|
@ -893,11 +914,13 @@ PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
|
|||
BasicBlock* OutliningCallBB = FuncBBPair.second;
|
||||
// Now compute the cost of the call sequence to the outlined function
|
||||
// 'OutlinedFunction' in BB 'OutliningCallBB':
|
||||
OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
|
||||
auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
|
||||
OutliningFuncCallCost +=
|
||||
computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
|
||||
|
||||
// Now compute the cost of the extracted/outlined function itself:
|
||||
for (BasicBlock &BB : *OutlinedFunc)
|
||||
OutlinedFunctionCost += computeBBInlineCost(&BB);
|
||||
OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
|
||||
}
|
||||
assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
|
||||
"Outlined function cost should be no less than the outlined region");
|
||||
|
@ -962,8 +985,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
|
|||
|
||||
PartialInlinerImpl::FunctionCloner::FunctionCloner(
|
||||
Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC)
|
||||
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI)
|
||||
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
|
||||
ClonedOI = std::make_unique<FunctionOutliningInfo>();
|
||||
|
||||
// Clone the function, so that we can hack away on it.
|
||||
|
@ -987,8 +1011,9 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner(
|
|||
PartialInlinerImpl::FunctionCloner::FunctionCloner(
|
||||
Function *F, FunctionOutliningMultiRegionInfo *OI,
|
||||
OptimizationRemarkEmitter &ORE,
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC)
|
||||
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
|
||||
function_ref<AssumptionCache *(Function &)> LookupAC,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI)
|
||||
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
|
||||
ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
|
||||
|
||||
// Clone the function, so that we can hack away on it.
|
||||
|
@ -1099,10 +1124,10 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
|
|||
|
||||
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
|
||||
|
||||
auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
|
||||
auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
|
||||
int Cost = 0;
|
||||
for (BasicBlock* BB : Region)
|
||||
Cost += computeBBInlineCost(BB);
|
||||
Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
|
||||
return Cost;
|
||||
};
|
||||
|
||||
|
@ -1196,9 +1221,10 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
|
|||
|
||||
// Gather up the blocks that we're going to extract.
|
||||
std::vector<BasicBlock *> ToExtract;
|
||||
auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
|
||||
ToExtract.push_back(ClonedOI->NonReturnBlock);
|
||||
OutlinedRegionCost +=
|
||||
PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
|
||||
OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
|
||||
ClonedOI->NonReturnBlock, ClonedFuncTTI);
|
||||
for (BasicBlock &BB : *ClonedFunc)
|
||||
if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
|
||||
ToExtract.push_back(&BB);
|
||||
|
@ -1206,7 +1232,7 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
|
|||
// into the outlined function which may make the outlining
|
||||
// overhead (the difference of the outlined function cost
|
||||
// and OutlinedRegionCost) look larger.
|
||||
OutlinedRegionCost += computeBBInlineCost(&BB);
|
||||
OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
|
||||
}
|
||||
|
||||
// Extract the body of the if.
|
||||
|
@ -1276,7 +1302,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
|
|||
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
|
||||
computeOutliningColdRegionsInfo(F, ORE);
|
||||
if (OMRI) {
|
||||
FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
|
||||
FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (TracePartialInlining) {
|
||||
|
@ -1309,7 +1335,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
|
|||
if (!OI)
|
||||
return {false, nullptr};
|
||||
|
||||
FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
|
||||
FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
|
||||
Cloner.NormalizeReturnBlock();
|
||||
|
||||
Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
; RUN: opt -partial-inliner -S < %s | FileCheck %s
|
||||
|
||||
; Checks that valid costs are computed for intrinsic calls.
|
||||
; https://bugs.llvm.org/show_bug.cgi?id=45932
|
||||
|
||||
|
||||
@emit_notes = external global i8, align 2
|
||||
|
||||
; CHECK: var_reg_delete
|
||||
; CHECK-NEXT: bb
|
||||
; CHECK-NEXT: tail call void @delete_variable_part()
|
||||
; CHECK-NEXT: ret void
|
||||
|
||||
define void @var_reg_delete() {
|
||||
bb:
|
||||
tail call void @delete_variable_part()
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: delete_variable_part
|
||||
; CHECK-NEXT: bb
|
||||
; CHECK-NEXT: %tmp1.i = tail call i32 @find_variable_location_part()
|
||||
; CHECK-NEXT: %tmp3.i = icmp sgt i32 %tmp1.i, -1
|
||||
; CHECK-NEXT: br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit
|
||||
|
||||
; CHECK: bb4.i
|
||||
; CHECK-NEXT: %tmp.i.i = load i8, i8* @emit_notes
|
||||
; CHECK-NEXT: %tmp1.i.i = icmp ne i8 %tmp.i.i, 0
|
||||
; CHECK-NEXT: tail call void @llvm.assume(i1 %tmp1.i.i)
|
||||
; CHECK-NEXT: unreachable
|
||||
|
||||
; CHECK: delete_slot_part.exit
|
||||
; CHECK-NEXT: ret void
|
||||
|
||||
define void @delete_variable_part() {
|
||||
bb:
|
||||
%tmp1.i = tail call i32 @find_variable_location_part()
|
||||
%tmp3.i = icmp sgt i32 %tmp1.i, -1
|
||||
br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit
|
||||
|
||||
bb4.i:
|
||||
%tmp.i.i = load i8, i8* @emit_notes, align 2
|
||||
%tmp1.i.i = icmp ne i8 %tmp.i.i, 0
|
||||
tail call void @llvm.assume(i1 %tmp1.i.i)
|
||||
unreachable
|
||||
|
||||
delete_slot_part.exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: declare i32 @find_variable_location_part
|
||||
declare i32 @find_variable_location_part()
|
||||
|
||||
; CHECK: declare void @llvm.assume(i1 noundef)
|
||||
declare void @llvm.assume(i1 noundef)
|
Loading…
Reference in New Issue