diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 329ac94e499f..5550376279f2 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined, static cl::opt DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial ininling")); +// This is an option used by testing: +static cl::opt SkipCostAnalysis("skip-partial-inlining-cost-analysis", + cl::init(false), cl::ZeroOrMore, + cl::ReallyHidden, + cl::desc("Skip Cost Analysis")); static cl::opt MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, @@ -53,6 +59,15 @@ static cl::opt MaxNumPartialInlining( "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of partial inlining. The default is unlimited")); +// Used only when PGO or user annotated branch data is absent. It is +// the least value that is used to weigh the outline region. If BFI +// produces larger value, the BFI value will be used. +static cl::opt + OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), + cl::Hidden, cl::ZeroOrMore, + cl::desc("Relative frequency of outline region to " + "the entry block")); + namespace { struct FunctionOutliningInfo { @@ -84,8 +99,6 @@ struct PartialInlinerImpl { bool run(Module &M); Function *unswitchFunction(Function *F); - std::unique_ptr computeOutliningInfo(Function *F); - private: int NumPartialInlining = 0; std::function *GetAssumptionCache; @@ -93,11 +106,84 @@ private: Optional> GetBFI; ProfileSummaryInfo *PSI; - bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE); + // Return the frequency of the OutlininingBB relative to F's entry point. + // The result is no larger than 1 and is represented using BP. + // (Note that the outlined region's 'head' block can only have incoming + // edges from the guarding entry blocks). + BranchProbability getOutliningCallBBRelativeFreq(Function *F, + FunctionOutliningInfo *OI, + Function *DuplicateFunction, + BlockFrequencyInfo *BFI, + BasicBlock *OutliningCallBB); + + // Return true if the callee of CS should be partially inlined with + // profit. + bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI, + BlockFrequencyInfo *CalleeBFI, + BasicBlock *OutliningCallBB, + int OutliningCallOverhead, + OptimizationRemarkEmitter &ORE); + + // Try to inline DuplicateFunction (cloned from F with call to + // the OutlinedFunction into its callers. Return true + // if there is any successful inlining. + bool tryPartialInline(Function *DuplicateFunction, + Function *F, /*orignal function */ + FunctionOutliningInfo *OI, Function *OutlinedFunction, + BlockFrequencyInfo *CalleeBFI); + + // Compute the mapping from use site of DuplicationFunction to the enclosing + // BB's profile count. + void computeCallsiteToProfCountMap(Function *DuplicateFunction, + DenseMap &SiteCountMap); + bool IsLimitReached() { return (MaxNumPartialInlining != -1 && NumPartialInlining >= MaxNumPartialInlining); } + + CallSite getCallSite(User *U) { + CallSite CS; + if (CallInst *CI = dyn_cast(U)) + CS = CallSite(CI); + else if (InvokeInst *II = dyn_cast(U)) + CS = CallSite(II); + else + llvm_unreachable("All uses must be calls"); + return CS; + } + + CallSite getOneCallSiteTo(Function *F) { + User *User = *F->user_begin(); + return getCallSite(User); + } + + std::tuple getOneDebugLoc(Function *F) { + CallSite CS = getOneCallSiteTo(F); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + return std::make_tuple(DLoc, Block); + } + + // Returns the costs associated with function outlining: + // - The first value is the non-weighted runtime cost for making the call + // to the outlined function 'OutlinedFunction', including the addtional + // setup cost in the outlined function itself; + // - The second value is the estimated size of the new call sequence in + // basic block 'OutliningCallBB'; + // - The third value is the estimated size of the original code from + // function 'F' that is extracted into the outlined function. + std::tuple + computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo, + Function *OutlinedFunction, + BasicBlock *OutliningCallBB); + // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to + // approximate both the size and runtime cost (Note that in the current + // inline cost analysis, there is no clear distinction there either). + int computeBBInlineCost(BasicBlock *BB); + + std::unique_ptr computeOutliningInfo(Function *F); + }; struct PartialInlinerLegacyPass : public ModulePass { @@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { // Do sanity check of the entries: threre should not // be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} - assert(OutliningInfo->Entries[0] == &F->front()); + assert(OutliningInfo->Entries[0] == &F->front() && + "Function Entry must be the first in Entries vector"); DenseSet Entries; for (BasicBlock *E : OutliningInfo->Entries) Entries.insert(E); @@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { return OutliningInfo; } -bool PartialInlinerImpl::shouldPartialInline(CallSite CS, - OptimizationRemarkEmitter &ORE) { - // TODO : more sharing with shouldInline in Inliner.cpp +// Check if there is PGO data or user annoated branch data: +static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { + if (F->getEntryCount()) + return true; + // Now check if any of the entry block has MD_prof data: + for (auto *E : OI->Entries) { + BranchInst *BR = dyn_cast(E->getTerminator()); + if (!BR || BR->isUnconditional()) + continue; + uint64_t T, F; + if (BR->extractProfMetadata(T, F)) + return true; + } + return false; +} + +BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( + Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, + BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) { + + auto EntryFreq = + BFI->getBlockFreq(&DuplicateFunction->getEntryBlock()); + auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB); + + auto OutlineRegionRelFreq = + BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), + EntryFreq.getFrequency()); + + if (hasProfileData(F, OI)) + return OutlineRegionRelFreq; + + // When profile data is not available, we need to be very + // conservative in estimating the overall savings. We need to make sure + // the outline region relative frequency is not below the threshold + // specified by the option. + OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); + + return OutlineRegionRelFreq; +} + +bool PartialInlinerImpl::shouldPartialInline( + CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI, + BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, + int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) { using namespace ore; + if (SkipCostAnalysis) + return true; + Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -302,36 +433,166 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS, if (IC.isAlways()) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) - << NV("Callee", Callee) + << NV("Callee", F) << " should always be fully inlined, not partially"); return false; } if (IC.isNever()) { ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) - << NV("Callee", Callee) << " not partially inlined into " + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because it should never be inlined (cost=never)"); return false; } if (!IC) { - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) - << NV("Callee", Callee) << " not partially inlined into " + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; } + const DataLayout &DL = Caller->getParent()->getDataLayout(); + // The savings of eliminating the call: + int NonWeightedSavings = getCallsiteCost(CS, DL); + BlockFrequency NormWeightedSavings(NonWeightedSavings); + + auto RelativeFreq = + getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB); + auto NormWeightedRcost = + BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq; + + // Weighted saving is smaller than weighted cost, return false + if (NormWeightedSavings < NormWeightedRcost) { + ORE.emit( + OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) + << NV("Callee", F) << " not partially inlined into " + << NV("Caller", Caller) << " runtime overhead (overhead=" + << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency()) + << ", savings=" + << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" + << " of making the outlined call is too high"); + + return false; + } ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) - << NV("Callee", Callee) << " can be partially inlined into " + << NV("Callee", F) << " can be partially inlined into " << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) << " (threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return true; } +// TODO: Ideally we should share Inliner's InlineCost Analysis code. +// For now use a simplified version. The returned 'InlineCost' will be used +// to esimate the size cost as well as runtime cost of the BB. +int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { + int InlineCost = 0; + const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (isa(I)) + continue; + + if (CallInst *CI = dyn_cast(I)) { + InlineCost += getCallsiteCost(CallSite(CI), DL); + continue; + } + + if (InvokeInst *II = dyn_cast(I)) { + InlineCost += getCallsiteCost(CallSite(II), DL); + continue; + } + + if (SwitchInst *SI = dyn_cast(I)) { + InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; + continue; + } + InlineCost += InlineConstants::InstrCost; + } + return InlineCost; +} + +std::tuple PartialInlinerImpl::computeOutliningCosts( + Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction, + BasicBlock *OutliningCallBB) { + // First compute the cost of the outlined region 'OI' in the original + // function 'F': + int OutlinedRegionCost = 0; + for (BasicBlock &BB : *F) { + if (&BB != OI->ReturnBlock && + // Assuming Entry set is small -- do a linear search here: + std::find(OI->Entries.begin(), OI->Entries.end(), &BB) == + OI->Entries.end()) { + OutlinedRegionCost += computeBBInlineCost(&BB); + } + } + + // Now compute the cost of the call sequence to the outlined function + // 'OutlinedFunction' in BB 'OutliningCallBB': + int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB); + + // Now compute the cost of the extracted/outlined function itself: + int OutlinedFunctionCost = 0; + for (BasicBlock &BB : *OutlinedFunction) { + OutlinedFunctionCost += computeBBInlineCost(&BB); + } + + assert(OutlinedFunctionCost >= OutlinedRegionCost && + "Outlined function cost should be no less than the outlined region"); + int OutliningRuntimeOverhead = + OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost); + + return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead, + OutlinedRegionCost); +} + +// Create the callsite to profile count map which is +// used to update the original function's entry count, +// after the function is partially inlined into the callsite. +void PartialInlinerImpl::computeCallsiteToProfCountMap( + Function *DuplicateFunction, + DenseMap &CallSiteToProfCountMap) { + std::vector Users(DuplicateFunction->user_begin(), + DuplicateFunction->user_end()); + Function *CurrentCaller = nullptr; + BlockFrequencyInfo *CurrentCallerBFI = nullptr; + + auto ComputeCurrBFI = [&,this](Function *Caller) { + // For the old pass manager: + if (!GetBFI) { + if (CurrentCallerBFI) + delete CurrentCallerBFI; + DominatorTree DT(*Caller); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*Caller, LI); + CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI); + } else { + // New pass manager: + CurrentCallerBFI = &(*GetBFI)(*Caller); + } + }; + + for (User *User : Users) { + CallSite CS = getCallSite(User); + Function *Caller = CS.getCaller(); + if (CurrentCaller != Caller) { + CurrentCaller = Caller; + ComputeCurrBFI(Caller); + } else { + assert(CurrentCallerBFI && "CallerBFI is not set"); + } + BasicBlock *CallBB = CS.getInstruction()->getParent(); + auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); + if (Count) + CallSiteToProfCountMap[User] = *Count; + else + CallSiteToProfCountMap[User] = 0; + } +} + Function *PartialInlinerImpl::unswitchFunction(Function *F) { if (F->hasAddressTaken()) @@ -347,21 +608,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { if (PSI->isFunctionEntryCold(F)) return nullptr; - std::unique_ptr OutliningInfo = - computeOutliningInfo(F); + if (F->user_begin() == F->user_end()) + return nullptr; - if (!OutliningInfo) + std::unique_ptr OI = computeOutliningInfo(F); + + if (!OI) return nullptr; // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; Function *DuplicateFunction = CloneFunction(F, VMap); - BasicBlock *NewReturnBlock = - cast(VMap[OutliningInfo->ReturnBlock]); - BasicBlock *NewNonReturnBlock = - cast(VMap[OutliningInfo->NonReturnBlock]); + BasicBlock *NewReturnBlock = cast(VMap[OI->ReturnBlock]); + BasicBlock *NewNonReturnBlock = cast(VMap[OI->NonReturnBlock]); DenseSet NewEntries; - for (BasicBlock *BB : OutliningInfo->Entries) { + for (BasicBlock *BB : OI->Entries) { NewEntries.insert(cast(VMap[BB])); } @@ -390,7 +651,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { BasicBlock *PreReturn = NewReturnBlock; // only split block when necessary: PHINode *FirstPhi = getFirstPHI(PreReturn); - unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size(); + unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size(); if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) { NewReturnBlock = NewReturnBlock->splitBasicBlock( @@ -408,14 +669,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { Ins = NewReturnBlock->getFirstNonPHI(); RetPhi->addIncoming(&*I, PreReturn); - for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) { + for (BasicBlock *E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE); OldPhi->removeIncomingValue(NewE); } ++I; } - for (auto E : OutliningInfo->ReturnBlockPreds) { + for (auto E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock); } @@ -443,50 +704,107 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI); // Extract the body of the if. - Function *ExtractedFunction = + Function *OutlinedFunction = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI) .extractCodeRegion(); - // Inline the top-level if test into all callers. - std::vector Users(DuplicateFunction->user_begin(), - DuplicateFunction->user_end()); - - for (User *User : Users) { - CallSite CS; - if (CallInst *CI = dyn_cast(User)) - CS = CallSite(CI); - else if (InvokeInst *II = dyn_cast(User)) - CS = CallSite(II); - else - llvm_unreachable("All uses must be calls"); - - if (IsLimitReached()) - continue; - - OptimizationRemarkEmitter ORE(CS.getCaller()); - if (!shouldPartialInline(CS, ORE)) - continue; - - DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); - BasicBlock *Block = CS.getParent(); - ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block) - << ore::NV("Callee", F) << " partially inlined into " - << ore::NV("Caller", CS.getCaller())); - - InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); - InlineFunction(CS, IFI); - NumPartialInlining++; - // update stats - NumPartialInlined++; - } + bool AnyInline = + tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI); // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. DuplicateFunction->replaceAllUsesWith(F); DuplicateFunction->eraseFromParent(); + if (!AnyInline && OutlinedFunction) + OutlinedFunction->eraseFromParent(); + return OutlinedFunction; +} +bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction, + Function *F, + FunctionOutliningInfo *OI, + Function *OutlinedFunction, + BlockFrequencyInfo *CalleeBFI) { + if (OutlinedFunction == nullptr) + return false; - return ExtractedFunction; + int NonWeightedRcost; + int SizeCost; + int OutlinedRegionSizeCost; + + auto OutliningCallBB = + getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent(); + + std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) = + computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB); + + // The call sequence to the outlined function is larger than the original + // outlined region size, it does not increase the chances of inlining + // 'F' with outlining (The inliner usies the size increase to model the + // the cost of inlining a callee). + if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) { + OptimizationRemarkEmitter ORE(F); + DebugLoc DLoc; + BasicBlock *Block; + std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", + DLoc, Block) + << ore::NV("Function", F) + << " not partially inlined into callers (Original Size = " + << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost) + << ", Size of call sequence to outlined function = " + << ore::NV("NewSize", SizeCost) << ")"); + return false; + } + + assert(F->user_begin() == F->user_end() && + "F's users should all be replaced!"); + std::vector Users(DuplicateFunction->user_begin(), + DuplicateFunction->user_end()); + + DenseMap CallSiteToProfCountMap; + if (F->getEntryCount()) + computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap); + + auto CalleeEntryCount = F->getEntryCount(); + uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0); + bool AnyInline = false; + for (User *User : Users) { + CallSite CS = getCallSite(User); + + if (IsLimitReached()) + continue; + + OptimizationRemarkEmitter ORE(CS.getCaller()); + + if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB, + NonWeightedRcost, ORE)) + continue; + + ORE.emit( + OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) + << ore::NV("Callee", F) << " partially inlined into " + << ore::NV("Caller", CS.getCaller())); + + InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); + InlineFunction(CS, IFI); + + // Now update the entry count: + if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { + uint64_t CallSiteCount = CallSiteToProfCountMap[User]; + CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); + } + + AnyInline = true; + NumPartialInlining++; + // Update the stats + NumPartialInlined++; + } + + if (AnyInline && CalleeEntryCount) + F->setEntryCount(CalleeEntryCountV); + + return AnyInline; } bool PartialInlinerImpl::run(Module &M) { diff --git a/llvm/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll b/llvm/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll index 509a4d7bfa18..8313cfac04ee 100644 --- a/llvm/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll +++ b/llvm/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that the CodeExtractor ; properly sets the entry count for the function that is diff --git a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll index 425e96973596..8e362080dc48 100644 --- a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll +++ b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that CodeExtractor updates ; the exit branch probabilities for multiple exit blocks. diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineAnd.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineAnd.ll index e981a5ba5816..d32d834d2df3 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineAnd.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineAnd.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -partial-inliner -S | FileCheck %s ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s -; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s ; Function Attrs: nounwind uwtable define i32 @bar(i32 %arg) local_unnamed_addr #0 { diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll new file mode 100644 index 000000000000..3a7a9752e507 --- /dev/null +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s + +define i32 @Func(i1 %cond, i32* align 4 %align.val) !prof !1 { +; CHECK: @Func({{.*}}) !prof [[REMAINCOUNT:![0-9]+]] +entry: + br i1 %cond, label %if.then, label %return +if.then: + ; Dummy store to have more than 0 uses + store i32 10, i32* %align.val, align 4 + br label %return +return: ; preds = %entry + ret i32 0 +} + +define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{ +entry: +; CHECK-LABEL: @Caller1 +; CHECK: br +; CHECK: call void @Func.1_ +; CHECK: br +; CHECK: call void @Func.1_ + %val = call i32 @Func(i1 %cond, i32* %align.val) + %val2 = call i32 @Func(i1 %cond, i32* %align.val) + ret i32 %val +} + +define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{ +entry: +; CHECK-LABEL: @Caller2 +; CHECK: br +; CHECK: call void @Func.1_ + %val = call i32 @Func(i1 %cond, i32* %align.val) + ret i32 %val +} + +; CHECK: [[REMAINCOUNT]] = !{!"function_entry_count", i64 150} +!1 = !{!"function_entry_count", i64 200} +!2 = !{!"function_entry_count", i64 10} +!3 = !{!"function_entry_count", i64 20} + diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineHighCost.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineHighCost.ll new file mode 100644 index 000000000000..e43a94dc6c37 --- /dev/null +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineHighCost.ll @@ -0,0 +1,107 @@ +; The outlined region has high frequency and the outlining +; call sequence is expensive (input, output, multiple exit etc) +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s + + +; Function Attrs: nounwind +define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = icmp slt i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb16, !prof !1 + +bb1: ; preds = %bb + %tmp2 = tail call i32 (...) @foo() #0 + %tmp3 = tail call i32 (...) @foo() #0 + %tmp4 = tail call i32 (...) @foo() #0 + %tmp5 = tail call i32 (...) @foo() #0 + %tmp6 = tail call i32 (...) @foo() #0 + %tmp7 = tail call i32 (...) @foo() #0 + %tmp8 = add nsw i32 %arg, 1 + %tmp9 = tail call i32 @goo(i32 %tmp8) #0 + %tmp10 = tail call i32 (...) @foo() #0 + %tmp11 = icmp eq i32 %tmp10, 0 + br i1 %tmp11, label %bb12, label %bb16 + +bb12: ; preds = %bb1 + %tmp13 = tail call i32 (...) @foo() #0 + %tmp14 = icmp eq i32 %tmp13, 0 + %tmp15 = select i1 %tmp14, i32 0, i32 3 + br label %bb16 + +bb16: ; preds = %bb12, %bb1, %bb + %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ] + ret i32 %tmp17 +} + +define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = icmp slt i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb16, !prof !2 + +bb1: ; preds = %bb + %tmp2 = tail call i32 (...) @foo() #0 + %tmp3 = tail call i32 (...) @foo() #0 + %tmp4 = tail call i32 (...) @foo() #0 + %tmp5 = tail call i32 (...) @foo() #0 + %tmp6 = tail call i32 (...) @foo() #0 + %tmp7 = tail call i32 (...) @foo() #0 + %tmp8 = add nsw i32 %arg, 1 + %tmp9 = tail call i32 @goo(i32 %tmp8) #0 + %tmp10 = tail call i32 (...) @foo() #0 + %tmp11 = icmp eq i32 %tmp10, 0 + br i1 %tmp11, label %bb12, label %bb16 + +bb12: ; preds = %bb1 + %tmp13 = tail call i32 (...) @foo() #0 + %tmp14 = icmp eq i32 %tmp13, 0 + %tmp15 = select i1 %tmp14, i32 0, i32 3 + br label %bb16 + +bb16: ; preds = %bb12, %bb1, %bb + %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ] + ret i32 %tmp17 +} + +; Function Attrs: nounwind +declare i32 @foo(...) local_unnamed_addr #0 + +; Function Attrs: nounwind +declare i32 @goo(i32) local_unnamed_addr #0 + +; Function Attrs: nounwind +define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 { +bb: +; CHECK-LABEL: @dummy_caller +; CHECK-NOT: br i1 +; CHECK-NOT: call{{.*}}bar_hot_outline_region. +; NOCOST-LABEL: @dummy_caller +; NOCOST: br i1 +; NOCOST: call{{.*}}bar_hot_outline_region. + + %tmp = tail call i32 @bar_hot_outline_region(i32 %arg) + ret i32 %tmp +} + +define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 { +bb: +; CHECK-LABEL: @dummy_caller2 +; CHECK: br i1 +; CHECK: call{{.*}}bar_cold_outline_region. +; NOCOST-LABEL: @dummy_caller2 +; NOCOST: br i1 +; NOCOST: call{{.*}}bar_cold_outline_region. + + %tmp = tail call i32 @bar_cold_outline_region(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (trunk 301898)"} +!1 = !{!"branch_weights", i32 2000, i32 1} +!2 = !{!"branch_weights", i32 1, i32 100} diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineOr.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineOr.ll index 5408b4faaf70..758945c7ade5 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineOr.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineOr.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s -; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll index 282d300fadb9..fb6d1c335361 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -partial-inliner -S | FileCheck %s ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s -; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s +; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s diff --git a/llvm/test/Transforms/CodeExtractor/SingleCondition.ll b/llvm/test/Transforms/CodeExtractor/SingleCondition.ll index 90cda889a21b..4110cd95b7ee 100644 --- a/llvm/test/Transforms/CodeExtractor/SingleCondition.ll +++ b/llvm/test/Transforms/CodeExtractor/SingleCondition.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s -; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) { entry: diff --git a/llvm/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll b/llvm/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll index 41d883c8c378..0f8a71907d85 100644 --- a/llvm/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll +++ b/llvm/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner | llc -filetype=null -; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This testcase checks to see if CodeExtractor properly inherits ; target specific attributes for the extracted function. This can ; cause certain instructions that depend on the attributes to not