Infrastructure for PGO enhancements in inliner

This patch provides the following infrastructure for PGO enhancements in inliner: Enable the use of block level profile information in inliner Incremental update of block frequency information during inlining Update the function entry counts of callees when they get inlined into callers. Differential Revision: http://reviews.llvm.org/D16381 llvm-svn: 262636
2016-03-03 18:26:33 +00:00 · 2016-03-03 18:26:33 +00:00 · 3035719c86
parent abcee45b7a
commit 3035719c86
11 changed files with 433 additions and 52 deletions
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@ -20,6 +20,7 @@

 namespace llvm {
 class AssumptionCacheTracker;
+class BlockFrequencyInfo;
 class CallSite;
 class DataLayout;
 class Function;
@ -38,6 +39,21 @@ namespace InlineConstants {
  const unsigned TotalAllocaSizeRecursiveCaller = 1024;
 }

+/// \brief Block frequency analysis for multiple functions.
+/// This class mimics block frequency analysis on CGSCC level. Block frequency
+/// info is computed on demand and cached unless they are invalidated.
+class BlockFrequencyAnalysis {
+private:
+  DenseMap<Function *, BlockFrequencyInfo *> BFM;
+
+public:
+  ~BlockFrequencyAnalysis();
+  /// \brief Returns BlockFrequencyInfo for a function.
+  BlockFrequencyInfo *getBlockFrequencyInfo(Function *);
+  /// \brief Invalidates block frequency info for a function.
+  void invalidateBlockFrequencyInfo(Function *);
+};
+
 /// \brief Represents the cost of inlining a function.
 ///
 /// This supports special values for functions which should "always" or
@ -111,7 +127,8 @@ public:
 /// inlining the callsite. It is an expensive, heavyweight call.
 InlineCost getInlineCost(CallSite CS, int DefaultThreshold,
                         TargetTransformInfo &CalleeTTI,
-                         AssumptionCacheTracker *ACT);
+                         AssumptionCacheTracker *ACT,
+                         BlockFrequencyAnalysis *BFA);

 /// \brief Get an InlineCost with the callee explicitly specified.
 /// This allows you to calculate the cost of inlining a function via a
@ -120,7 +137,8 @@ InlineCost getInlineCost(CallSite CS, int DefaultThreshold,
 //
 InlineCost getInlineCost(CallSite CS, Function *Callee, int DefaultThreshold,
                         TargetTransformInfo &CalleeTTI,
-                         AssumptionCacheTracker *ACT);
+                         AssumptionCacheTracker *ACT,
+                         BlockFrequencyAnalysis *BFA);

 int computeThresholdFromOptLevels(unsigned OptLevel, unsigned SizeOptLevel);

@ -129,6 +147,9 @@ int getDefaultInlineThreshold();

 /// \brief Minimal filter to detect invalid constructs for inlining.
 bool isInlineViable(Function &Callee);
+
+/// \brief Return estimated count of the block \p BB.
+Optional<uint64_t> getBlockCount(BasicBlock *BB, BlockFrequencyAnalysis *BFA);
 }

 #endif
--- a/llvm/include/llvm/Transforms/IPO/InlinerPass.h
+++ b/llvm/include/llvm/Transforms/IPO/InlinerPass.h
@ -24,8 +24,18 @@ class AssumptionCacheTracker;
 class CallSite;
 class DataLayout;
 class InlineCost;
+class BlockFrequencyAnalysis;
 template <class PtrType, unsigned SmallSize> class SmallPtrSet;

+// Functor invoked when a block is cloned during inlining.
+typedef std::function<void(const BasicBlock *, const BasicBlock *)>
+    BlockCloningFunctor;
+// Functor invoked when a function is inlined inside the basic block
+// containing the call.
+typedef std::function<void(BasicBlock *, Function *)> FunctionCloningFunctor;
+// Functor invoked when a function gets deleted during inlining.
+typedef std::function<void(Function *)> FunctionDeletedFunctor;
+
 /// Inliner - This class contains all of the helper code which is used to
 /// perform the inlining operations that do not depend on the policy.
 ///
@ -69,9 +79,28 @@ private:
  /// shouldInline - Return true if the inliner should attempt to
  /// inline at the given CallSite.
  bool shouldInline(CallSite CS);
+  /// Set the BFI of \p Dst to be the same as \p Src.
+  void copyBlockFrequency(BasicBlock *Src, BasicBlock *Dst);
+  /// Invalidates BFI for function \p F.
+  void invalidateBFI(Function *F);
+  /// Invalidates BFI for all functions in  \p SCC.
+  void invalidateBFI(CallGraphSCC &SCC);
+  /// Update function entry count for \p Callee which has been inlined into
+  /// \p CallBB.
+  void updateEntryCount(BasicBlock *CallBB, Function *Callee);
+  /// \brief Update block frequency of an inlined block.
+  /// This method updates the block frequency of \p NewBB which is a clone of
+  /// \p OrigBB when the callsite \p CS gets inlined. The frequency of \p NewBB
+  /// is computed as follows:
+  /// Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq.
+  void updateBlockFreq(CallSite &CS, const BasicBlock *OrigBB,
+                       const BasicBlock *NewBB);

 protected:
  AssumptionCacheTracker *ACT;
+  std::unique_ptr<BlockFrequencyAnalysis> BFA;
+  /// Are we using profile guided optimization?
+  bool HasProfileData;
 };

 } // End llvm namespace
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@ -48,6 +48,9 @@ class AllocaInst;
 class AssumptionCacheTracker;
 class DominatorTree;

+typedef std::function<void(const BasicBlock *, const BasicBlock *)>
+    BlockCloningFunctor;
+
 /// Return an exact copy of the specified module
 ///
 std::unique_ptr<Module> CloneModule(const Module *M);
@ -157,7 +160,8 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                               ValueToValueMapTy &VMap, bool ModuleLevelChanges,
                               SmallVectorImpl<ReturnInst *> &Returns,
                               const char *NameSuffix = "",
-                               ClonedCodeInfo *CodeInfo = nullptr);
+                               ClonedCodeInfo *CodeInfo = nullptr,
+                               BlockCloningFunctor Ftor = nullptr);

 /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
 /// except that it does some simple constant prop and DCE on the fly.  The
@ -172,23 +176,27 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
 ///
 void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                               ValueToValueMapTy &VMap, bool ModuleLevelChanges,
-                               SmallVectorImpl<ReturnInst*> &Returns,
+                               SmallVectorImpl<ReturnInst *> &Returns,
                               const char *NameSuffix = "",
                               ClonedCodeInfo *CodeInfo = nullptr,
-                               Instruction *TheCall = nullptr);
+                               Instruction *TheCall = nullptr,
+                               BlockCloningFunctor Ftor = nullptr);

 /// InlineFunctionInfo - This class captures the data input to the
 /// InlineFunction call, and records the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
  explicit InlineFunctionInfo(CallGraph *cg = nullptr,
-                              AssumptionCacheTracker *ACT = nullptr)
-      : CG(cg), ACT(ACT) {}
+                              AssumptionCacheTracker *ACT = nullptr,
+                              BlockCloningFunctor Ftor = nullptr)
+      : CG(cg), ACT(ACT), Ftor(Ftor) {}

  /// CG - If non-null, InlineFunction will update the callgraph to reflect the
  /// changes it makes.
  CallGraph *CG;
  AssumptionCacheTracker *ACT;
+  // Functor that is invoked when a block is cloned into the new function.
+  BlockCloningFunctor Ftor;

  /// StaticAllocas - InlineFunction fills this in with all static allocas that
  /// get copied into the caller.
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@ -18,13 +18,18 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/InstVisitor.h"
@ -85,6 +90,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

+  BlockFrequencyAnalysis *BFA;
  int Threshold;
  int Cost;

@ -153,6 +159,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);
+  /// Adjust Threshold based on CallSiteCount and return the adjusted threshold.
+  int getAdjustedThreshold(int Threshold, Optional<uint64_t> CallSiteCount);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@ -194,17 +202,19 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {

 public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
-               Function &Callee, int Threshold, CallSite CSArg)
-    : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
-        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
-        ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+               Function &Callee, int Threshold, CallSite CSArg,
+               BlockFrequencyAnalysis *BFA)
+      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), BFA(BFA),
+        Threshold(Threshold), Cost(0), IsCallerRecursive(false),
+        IsRecursiveCall(false), ExposesReturnsTwice(false),
+        HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
+        HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
+        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

@ -572,6 +582,15 @@ bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  return false;
 }

+// Adjust the threshold based on callsite hotness. Currently this is a nop.
+int CallAnalyzer::getAdjustedThreshold(int Threshold,
+                                       Optional<uint64_t> CallSiteCount
+                                       __attribute__((unused))) {
+  // FIXME: The new threshold should be computed from the given Threshold and
+  // the callsite hotness.
+  return Threshold;
+}
+
 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If -inline-threshold is not given, listen to the optsize and minsize
  // attributes when they would decrease the threshold.
@ -596,6 +615,9 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }
+  Optional<uint64_t> CallSiteCount =
+      llvm::getBlockCount(CS.getInstruction()->getParent(), BFA);
+  Threshold = getAdjustedThreshold(Threshold, CallSiteCount);

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
@ -912,7 +934,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
+  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS,
+                  BFA);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
@ -1433,9 +1456,10 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,

 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
-                               AssumptionCacheTracker *ACT) {
+                               AssumptionCacheTracker *ACT,
+                               BlockFrequencyAnalysis *BFA) {
  return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
-                       ACT);
+                       ACT, BFA);
 }

 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
@ -1454,7 +1478,8 @@ int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                               int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
-                               AssumptionCacheTracker *ACT) {
+                               AssumptionCacheTracker *ACT,
+                               BlockFrequencyAnalysis *BFA) {

  // Cannot inline indirect calls.
  if (!Callee)
@ -1487,7 +1512,7 @@ InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
        << "...\n");

-  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
+  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS, BFA);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());
@ -1535,3 +1560,45 @@ bool llvm::isInlineViable(Function &F) {

  return true;
 }
+
+/// \brief Get estimated execution count for \p BB.
+Optional<uint64_t> llvm::getBlockCount(BasicBlock *BB,
+                                       BlockFrequencyAnalysis *BFA) {
+  if (!BFA)
+    return None;
+  Function *F = BB->getParent();
+  Optional<uint64_t> EntryCount = F->getEntryCount();
+  if (!EntryCount)
+    return None;
+  BlockFrequencyInfo *BFI = BFA->getBlockFrequencyInfo(F);
+  uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
+  uint64_t FunctionEntryFreq = BFI->getEntryFreq();
+  uint64_t BBCount = EntryCount.getValue() * BBFreq / FunctionEntryFreq;
+  return BBCount;
+}
+
+BlockFrequencyAnalysis::~BlockFrequencyAnalysis() {
+  for (auto &Entry : BFM) {
+    delete Entry.second;
+  }
+}
+
+/// \brief Get BlockFrequencyInfo for a function.
+BlockFrequencyInfo *BlockFrequencyAnalysis::getBlockFrequencyInfo(Function *F) {
+  auto Iter = BFM.find(F);
+  if (Iter != BFM.end())
+    return Iter->second;
+  // We need to create a BlockFrequencyInfo object for F and store it.
+  DominatorTree DT;
+  DT.recalculate(*F);
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*F, LI);
+  BlockFrequencyInfo *BFI = new BlockFrequencyInfo(*F, BPI, LI);
+  BFM[F] = BFI;
+  return BFI;
+}
+
+/// \brief Invalidate BlockFrequencyInfo for a function.
+void BlockFrequencyAnalysis::invalidateBlockFrequencyInfo(Function *F) {
+  BFM.erase(F);
+}
--- a/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/llvm/lib/Transforms/IPO/InlineSimple.cpp
@ -59,7 +59,8 @@ public:
  InlineCost getInlineCost(CallSite CS) override {
    Function *Callee = CS.getCalledFunction();
    TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
-    return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT);
+    return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT,
+                               HasProfileData ? BFA.get() : nullptr);
  }

  bool runOnSCC(CallGraphSCC &SCC) override;
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@ -19,6 +19,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@ -47,10 +48,13 @@ STATISTIC(NumMergedAllocas, "Number of allocas merged together");
 // if those would be more profitable and blocked inline steps.
 STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");

-Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
+Inliner::Inliner(char &ID)
+    : CallGraphSCCPass(ID), InsertLifetime(true),
+      BFA(new BlockFrequencyAnalysis()) {}

 Inliner::Inliner(char &ID, bool InsertLifetime)
-    : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
+    : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime),
+      BFA(new BlockFrequencyAnalysis()) {}

 /// For this class, we declare that we require and preserve the call graph.
 /// If the derived class implements this method, it should
@ -259,7 +263,7 @@ bool Inliner::shouldInline(CallSite CS) {
                         Twine(IC.getCostDelta() + IC.getCost()) + ")");
    return false;
  }
-  
+
  // Try to detect the case where the current inlining candidate caller (call
  // it B) is a static or linkonce-ODR function and is an inlining candidate
  // elsewhere, and the current candidate callee (call it C) is large enough
@ -356,8 +360,90 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
  return false;
 }

+/// \brief Update the frequency of a block that is cloned into the caller.
+/// This is invoked when \p OrigBB from the callee is cloned into \p NewBB in
+/// the caller.
+void Inliner::updateBlockFreq(CallSite &CS, const BasicBlock *OrigBB,
+                              const BasicBlock *NewBB) {
+  if (!HasProfileData)
+    return;
+  Instruction *Call = CS.getInstruction();
+  BasicBlock *CallBB = Call->getParent();
+  BlockFrequencyInfo *CalleeBFI =
+      BFA->getBlockFrequencyInfo(CS.getCalledFunction());
+  BlockFrequencyInfo *CallerBFI =
+      BFA->getBlockFrequencyInfo(CallBB->getParent());
+  // Find the number of times OrigBB is executed per invocation of the callee
+  // and multiply by the number of times callee is executed in the caller.
+  // Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq.
+  uint64_t CallSiteFreq = CallerBFI->getBlockFreq(CallBB).getFrequency();
+  uint64_t CalleeEntryFreq = CalleeBFI->getEntryFreq();
+  // Frequency of OrigBB in the callee.
+  BlockFrequency OrigBBFreq = CalleeBFI->getBlockFreq(OrigBB);
+  CallerBFI->setBlockFreq(NewBB, (double)(OrigBBFreq.getFrequency()) /
+                                     CalleeEntryFreq * CallSiteFreq);
+}
+
+/// \brief Update entry count of \p Callee after it got inlined at a callsite
+/// in block \p CallBB.
+void Inliner::updateEntryCount(BasicBlock *CallBB, Function *Callee) {
+  if (!HasProfileData)
+    return;
+  // If the callee has a original count of N, and the estimated count of
+  // callsite is M, the new callee count is set to N - M. M is estimated from
+  // the caller's entry count, its entry block frequency and the block frequency
+  // of the callsite.
+  Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+  if (!CalleeCount)
+    return;
+  Optional<uint64_t> CallSiteCount = llvm::getBlockCount(CallBB, BFA.get());
+  if (!CallSiteCount)
+    return;
+  // Since CallSiteCount is an estimate, it could exceed the original callee
+  // count and has to be set to 0.
+  if (CallSiteCount.getValue() > CalleeCount.getValue()) {
+    Callee->setEntryCount(0);
+    DEBUG(llvm::dbgs() << "Estimated count of block " << CallBB->getName()
+                       << " is " << CallSiteCount.getValue()
+                       << " which exceeds the entry count "
+                       << CalleeCount.getValue() << " of the callee "
+                       << Callee->getName() << "\n");
+  } else
+    Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue());
+}
+
+void Inliner::invalidateBFI(Function *F) {
+  if (!HasProfileData)
+    return;
+  if (F)
+    BFA->invalidateBlockFrequencyInfo(F);
+}
+void Inliner::invalidateBFI(CallGraphSCC &SCC) {
+  if (!HasProfileData)
+    return;
+  for (CallGraphNode *Node : SCC) {
+    Function *F = Node->getFunction();
+    invalidateBFI(F);
+  }
+}
+void Inliner::copyBlockFrequency(BasicBlock *Src, BasicBlock *Dst) {
+  if (!HasProfileData)
+    return;
+  Function *F = Src->getParent();
+  BlockFrequencyInfo *BFI = BFA->getBlockFrequencyInfo(F);
+  BFI->setBlockFreq(Dst, BFI->getBlockFreq(Src).getFrequency());
+}
+
+static bool hasProfileData(Module &M) {
+  // We check for the presence of MaxFunctionCount in the module.
+  // FIXME: This now only works for frontend based instrumentation.
+  return M.getMaximumFunctionCount().hasValue();
+}
+
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+  using namespace std::placeholders;
  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  HasProfileData = hasProfileData(CG.getModule());
  ACT = &getAnalysis<AssumptionCacheTracker>();
  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

@ -419,7 +505,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {

  
  InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, ACT);

  // Now that we have all of the call sites, loop over them and inline them if
  // it looks profitable to do so.
@ -448,6 +533,10 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
        CS.getInstruction()->eraseFromParent();
        ++NumCallsDeleted;
      } else {
+        Instruction *TheCall = CS.getInstruction();
+        BasicBlock *CallSiteBlock = TheCall->getParent();
+        Instruction *CallSuccessor = &*(++BasicBlock::iterator(TheCall));
+
        // We can only inline direct calls to non-declarations.
        if (!Callee || Callee->isDeclaration()) continue;
      
@ -476,6 +565,11 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
          continue;
        }

+        BlockCloningFunctor BCF = nullptr;
+        if (HasProfileData)
+          BCF = std::bind(&Inliner::updateBlockFreq, this, CS, _1, _2);
+        InlineFunctionInfo InlineInfo(&CG, ACT, BCF);
+
        // Attempt to inline the function.
        if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas,
                                  InlineHistoryID, InsertLifetime)) {
@ -485,6 +579,13 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
                                             Caller->getName()));
          continue;
        }
+        updateEntryCount(CallSiteBlock, Callee);
+        // The instruction following the call is part of a new basic block
+        // created during the inlining process. This does not have an entry in
+        // the BFI. We create an entry by copying the frequency of the original
+        // block containing the call.
+        copyBlockFrequency(CallSiteBlock, CallSuccessor->getParent());
+
        ++NumInlined;

        // Report the inline decision.
@ -523,7 +624,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
        CalleeNode->removeAllCalledFunctions();
        
        // Removing the node for callee from the call graph and delete it.
-        delete CG.removeFunctionFromModule(CalleeNode);
+        Function *F = CG.removeFunctionFromModule(CalleeNode);
+        invalidateBFI(F);
+        delete F;
        ++NumDeleted;
      }

@ -544,6 +647,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
    }
  } while (LocalChange);

+  invalidateBFI(SCC);
  return Changed;
 }

@ -651,7 +755,9 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
                                      FunctionsToRemove.end()),
                          FunctionsToRemove.end());
  for (CallGraphNode *CGN : FunctionsToRemove) {
-    delete CG.removeFunctionFromModule(CGN);
+    Function *F = CG.removeFunctionFromModule(CGN);
+    invalidateBFI(F);
+    delete F;
    ++NumDeleted;
  }
  return true;
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@ -277,9 +277,10 @@ namespace {

    /// The specified block is found to be reachable, clone it and
    /// anything that it can reach.
-    void CloneBlock(const BasicBlock *BB, 
+    void CloneBlock(const BasicBlock *BB,
                    BasicBlock::const_iterator StartingInst,
-                    std::vector<const BasicBlock*> &ToClone);
+                    std::vector<const BasicBlock *> &ToClone,
+                    BlockCloningFunctor Ftor = nullptr);
  };
 }

@ -287,7 +288,8 @@ namespace {
 /// anything that it can reach.
 void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
                                       BasicBlock::const_iterator StartingInst,
-                                       std::vector<const BasicBlock*> &ToClone){
+                                       std::vector<const BasicBlock *> &ToClone,
+                                       BlockCloningFunctor Ftor) {
  WeakVH &BBEntry = VMap[BB];

  // Have we already cloned this block?
@ -424,18 +426,19 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && 
      BB != &BB->getParent()->front();
  }
+  // Call Ftor to tell BB has been cloned to NewBB
+  if (Ftor)
+    Ftor(BB, NewBB);
 }

 /// This works like CloneAndPruneFunctionInto, except that it does not clone the
 /// entire function. Instead it starts at an instruction provided by the caller
 /// and copies (and prunes) only the code reachable from that instruction.
-void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
-                                     const Instruction *StartingInst,
-                                     ValueToValueMapTy &VMap,
-                                     bool ModuleLevelChanges,
-                                     SmallVectorImpl<ReturnInst *> &Returns,
-                                     const char *NameSuffix,
-                                     ClonedCodeInfo *CodeInfo) {
+void llvm::CloneAndPruneIntoFromInst(
+    Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst,
+    ValueToValueMapTy &VMap, bool ModuleLevelChanges,
+    SmallVectorImpl<ReturnInst *> &Returns, const char *NameSuffix,
+    ClonedCodeInfo *CodeInfo, BlockCloningFunctor Ftor) {
  assert(NameSuffix && "NameSuffix cannot be null!");

  ValueMapTypeRemapper *TypeMapper = nullptr;
@ -461,11 +464,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,

  // Clone the entry block, and anything recursively reachable from it.
  std::vector<const BasicBlock*> CloneWorklist;
-  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
+  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist, Ftor);
  while (!CloneWorklist.empty()) {
    const BasicBlock *BB = CloneWorklist.back();
    CloneWorklist.pop_back();
-    PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
+    PFC.CloneBlock(BB, BB->begin(), CloneWorklist, Ftor);
  }
  
  // Loop over all of the basic blocks in the old function.  If the block was
@ -667,15 +670,14 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
 /// constant arguments cause a significant amount of code in the callee to be
 /// dead.  Since this doesn't produce an exact copy of the input, it can't be
 /// used for things like CloneFunction or CloneModule.
-void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
-                                     ValueToValueMapTy &VMap,
-                                     bool ModuleLevelChanges,
-                                     SmallVectorImpl<ReturnInst*> &Returns,
-                                     const char *NameSuffix, 
-                                     ClonedCodeInfo *CodeInfo,
-                                     Instruction *TheCall) {
+void llvm::CloneAndPruneFunctionInto(
+    Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap,
+    bool ModuleLevelChanges, SmallVectorImpl<ReturnInst *> &Returns,
+    const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall,
+    BlockCloningFunctor Ftor) {
  CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
-                            ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
+                            ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
+                            Ftor);
 }

 /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@ -1319,7 +1319,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,

  // If IFI has any state in it, zap it before we fill it in.
  IFI.reset();
-  
+
  const Function *CalledFunc = CS.getCalledFunction();
  if (!CalledFunc ||              // Can't inline external function or indirect
      CalledFunc->isDeclaration() || // call, or call to a vararg function!
@ -1486,7 +1486,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
    // happy with whatever the cloner can do.
    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
                              /*ModuleLevelChanges=*/false, Returns, ".i",
-                              &InlinedFunctionInfo, TheCall);
+                              &InlinedFunctionInfo, TheCall, IFI.Ftor);

    // Remember the first block that is newly cloned over.
    FirstNewBlock = LastBlock; ++FirstNewBlock;
--- a/llvm/test/Transforms/Inline/function-count-update-2.ll
+++ b/llvm/test/Transforms/Inline/function-count-update-2.ll
@ -0,0 +1,27 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; This tests that the function count of a callee gets correctly updated after it
+; has been inlined into a two callsites.
+
+; CHECK: @callee() !prof [[COUNT:![0-9]+]]
+define i32 @callee() !prof !1 {
+  ret i32 0
+}
+
+define i32 @caller1() !prof !2 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+define i32 @caller2() !prof !3 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 600}
+!3 = !{!"function_entry_count", i64 400}
+
--- a/llvm/test/Transforms/Inline/function-count-update-3.ll
+++ b/llvm/test/Transforms/Inline/function-count-update-3.ll
@ -0,0 +1,69 @@
+; RUN: opt < %s -inline -S -inline-threshold=50 | FileCheck %s
+
+; This tests that the function count of a function gets properly scaled after 
+; inlining a call chain leading to the function.
+; Function a calls c with count 200 (C1)
+; Function b calls c with count 300
+; Function c calls e with count 250 (C2)
+; Entry count of e is 500 (C3)
+; c->e inlining does not happen since the cost exceeds threshold.
+; c then inlined into a.
+; e now gets inlined into a (through c) since the branch condition in e is now
+; known and hence the cost gets reduced.
+; Estimated count of a->e callsite = C2 * (C1 / C3)
+; Estimated count of a->e callsite = 250 * (200 / 500) = 100
+; Remaining count of e = C3 - 100 = 500 - 100 = 400
+
+@data = external global i32
+
+define i32 @a(i32 %a1) !prof !1 {
+  %a2 = call i32 @c(i32 %a1, i32 1)
+  ret i32 %a2
+}
+
+define i32 @b(i32 %b1) !prof !2 {
+  %b2 = call i32 @c(i32 %b1, i32 %b1)
+  ret i32 %b2
+}
+
+define i32 @c(i32 %c1, i32 %c100) !prof !3 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  ret i32 0
+
+cond_true:
+  %c11 = call i32 @e(i32 %c100)
+  ret i32 %c11
+}
+
+; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]]
+define i32 @e(i32 %c1) !prof !4 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400}
+!0 = !{i32 1, !"MaxFunctionCount", i32 5000}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 300}
+!3 = !{!"function_entry_count", i64 500}
+!4 = !{!"function_entry_count", i64 500}
+
--- a/llvm/test/Transforms/Inline/function-count-update.ll
+++ b/llvm/test/Transforms/Inline/function-count-update.ll
@ -0,0 +1,51 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; RUN: opt < %s -always-inline -S | FileCheck %s
+
+; This tests that the function count of two callees get correctly updated after
+; they have been inlined into two back-to-back callsites in a single basic block
+; in the caller. The callees have the alwaysinline attribute and so they get
+; inlined both with the regular inliner pass and the always inline pass. In
+; both cases, the new count of each callee is the original count minus callsite
+; count which is 200 (since the caller's entry count is 400 and the block
+; containing the calls have a relative block frequency of 0.5).
+
+; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]]
+define i32 @callee1(i32 %n) #0 !prof !1 {
+  %cond = icmp sle i32 %n, 10
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+cond_false:
+  %r2 = add i32 %n, 2
+  ret i32 %r2
+}
+
+; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]]
+define i32 @callee2(i32 %n) #0 !prof !2 {
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+}
+
+define i32 @caller(i32 %n) !prof !3 {
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %i = call i32 @callee1(i32 %n)
+  %j = call i32 @callee2(i32 %i)
+  ret i32 %j
+cond_false:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 2000}
+!3 = !{!"function_entry_count", i64 400}
+attributes #0 = { alwaysinline }
+