Split FrameAnalysis and improve LivenessAnalysis

Summary:
Split FrameAnalysis into FrameAnalysis and RegAnalysis, since
some optimizations only require register information about functions,
not frame information. Refactor the call graph walking code into the
CallGraphWalker class, allowing any analysis that depends on the call
graph to traverse it easily via a visitor pattern. Also fix
LivenessAnalysis, which was broken because it did not consider
registers read by callees and incorporate that information into the
caller.

(cherry picked from FBD5177901)
Rafael Auler 2017-06-02 16:57:22 -07:00 committed by Maksim Panchenko
parent d850ca3622
commit 2c23094299
19 changed files with 584 additions and 317 deletions
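
At a glance, the LivenessAnalysis fix lands in the gen step of the backward
transfer function: for calls, registers read inside the callee must now be
added to the live set. A minimal sketch of the corrected transfer, using the
RegAnalysis interface introduced later in this diff (Cur is the live set
after Point; BC, RA and NumRegs as in LivenessAnalysis):

    BitVector Next = Cur;
    BitVector Written(NumRegs, false), Used(NumRegs, false);
    if (BC.MIA->isCall(Point))
      RA.getInstClobberList(Point, Written);  // kill: regs the callee writes
    else
      BC.MIA->getWrittenRegs(Point, Written, *BC.MRI);
    Written.flip();
    Next &= Written;                          // drop killed regs
    // gen: for calls, this now folds in regs read inside the callee
    RA.getInstUsedRegsList(Point, Used, /*GetClobbers=*/false);
    Next |= Used;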

bolt/Passes/BinaryFunctionCallGraph.cpp

@ -12,6 +12,7 @@
#include "BinaryFunctionCallGraph.h"
#include "BinaryFunction.h"
#include "BinaryContext.h"
#include "llvm/Support/Timer.h"
#define DEBUG_TYPE "callgraph"
@ -30,6 +31,7 @@ CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF,
}
std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
NamedRegionTimer T1("Build cg traversal order", "CG breakdown", true);
std::deque<BinaryFunction *> TopologicalOrder;
enum NodeStatus { NEW, VISITING, VISITED };
std::vector<NodeStatus> NodeStatus(Funcs.size());
@ -73,6 +75,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
bool IncludeColdCalls,
bool UseFunctionHotSize,
bool UseEdgeCounts) {
NamedRegionTimer T1("Callgraph construction", "CG breakdown", true);
BinaryFunctionCallGraph Cg;
// Add call graph nodes.

bolt/Passes/CMakeLists.txt

@ -3,6 +3,7 @@ add_llvm_library(LLVMBOLTPasses
BinaryPasses.cpp
BinaryFunctionCallGraph.cpp
CallGraph.cpp
CallGraphWalker.cpp
DataflowAnalysis.cpp
DataflowInfoManager.cpp
FrameAnalysis.cpp
@ -13,6 +14,7 @@ add_llvm_library(LLVMBOLTPasses
Inliner.cpp
LivenessAnalysis.cpp
PettisAndHansen.cpp
RegAnalysis.cpp
ReorderAlgorithm.cpp
ReorderFunctions.cpp
ShrinkWrapping.cpp

bolt/Passes/CallGraphWalker.cpp (new file)

@ -0,0 +1,46 @@
#include "CallGraphWalker.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
void CallGraphWalker::traverseCG() {
NamedRegionTimer T1("CG Traversal", "CG breakdown", true);
std::queue<BinaryFunction *> Queue;
std::set<BinaryFunction *> InQueue;
for (auto *Func : TopologicalCGOrder) {
Queue.push(Func);
InQueue.insert(Func);
}
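// Fixed-point iteration: whenever a visitor reports a change for Func,
// re-enqueue its callers so they are revisited with the updated info.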
while (!Queue.empty()) {
auto *Func = Queue.front();
Queue.pop();
InQueue.erase(Func);
bool Changed{false};
for (auto Visitor : Visitors) {
bool CurVisit = Visitor(Func);
Changed = Changed || CurVisit;
}
if (Changed) {
for (auto CallerID : CG.predecessors(CG.getNodeId(Func))) {
BinaryFunction *CallerFunc = CG.nodeIdToFunc(CallerID);
if (InQueue.count(CallerFunc))
continue;
Queue.push(CallerFunc);
InQueue.insert(CallerFunc);
}
}
}
}
void CallGraphWalker::walk() {
TopologicalCGOrder = CG.buildTraversalOrder();
traverseCG();
}
}
}

bolt/Passes/CallGraphWalker.h (new file)

@ -0,0 +1,67 @@
//===--- Passes/CallGraphWalker.h -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPHWALKER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPHWALKER_H
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryFunctionCallGraph.h"
#include <queue>
#include <map>
#include <set>
#include <vector>
namespace llvm {
namespace bolt {
/// Perform a bottom-up walk of the call graph with the intent of computing
/// a property that depends on callees. In the event of CG cycles, this will
/// re-visit functions until their observed property converges.
class CallGraphWalker {
BinaryContext &BC;
std::map<uint64_t, BinaryFunction> &BFs;
BinaryFunctionCallGraph &CG;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up
std::deque<BinaryFunction *> TopologicalCGOrder;
/// Stores all visitor functions to call when traversing the call graph
typedef std::function<bool(BinaryFunction*)> CallbackTy;
std::vector<CallbackTy> Visitors;
/// Do the bottom-up traversal
void traverseCG();
public:
/// Initialize core context references but don't do anything yet
CallGraphWalker(BinaryContext &BC, std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG)
: BC(BC), BFs(BFs), CG(CG) {}
/// Register a new callback function to be called for each function when
/// traversing the call graph bottom-up. The callback should return true iff
/// the information it tracks has changed. It must converge over time, i.e.,
/// eventually return false; otherwise, the call graph walk will never
/// finish.
void registerVisitor(CallbackTy Callback) {
Visitors.emplace_back(Callback);
}
/// Build the call graph, establish a traversal order and traverse it.
void walk();
};
}
}
#endif
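
A usage sketch (computeInfoFor is a hypothetical per-function analysis; the
pattern mirrors how RegAnalysis uses the walker later in this diff):

    CallGraphWalker Walker(BC, BFs, CG);
    std::map<const BinaryFunction *, BitVector> Info;
    Walker.registerVisitor([&](BinaryFunction *Func) -> bool {
      BitVector New = computeInfoFor(Func);  // hypothetical helper
      bool Changed = !Info.count(Func) || Info[Func] != New;
      if (Changed)
        Info[Func] = std::move(New);
      return Changed;  // true re-enqueues Func's callers
    });
    Walker.walk();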

bolt/Passes/DataflowInfoManager.cpp

@ -18,8 +18,8 @@ namespace bolt {
ReachingDefOrUse</*Def=*/true> &DataflowInfoManager::getReachingDefs() {
if (RD)
return *RD;
assert(FA && "FrameAnalysis required");
RD.reset(new ReachingDefOrUse<true>(*FA, BC, BF));
assert(RA && "RegAnalysis required");
RD.reset(new ReachingDefOrUse<true>(*RA, BC, BF));
RD->run();
return *RD;
}
@ -31,8 +31,8 @@ void DataflowInfoManager::invalidateReachingDefs() {
ReachingDefOrUse</*Def=*/false> &DataflowInfoManager::getReachingUses() {
if (RU)
return *RU;
assert(FA && "FrameAnalysis required");
RU.reset(new ReachingDefOrUse<false>(*FA, BC, BF));
assert(RA && "RegAnalysis required");
RU.reset(new ReachingDefOrUse<false>(*RA, BC, BF));
RU->run();
return *RU;
}
@ -44,8 +44,8 @@ void DataflowInfoManager::invalidateReachingUses() {
LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() {
if (LA)
return *LA;
assert(FA && "FrameAnalysis required");
LA.reset(new LivenessAnalysis(*FA, BC, BF));
assert(RA && "RegAnalysis required");
LA.reset(new LivenessAnalysis(*RA, BC, BF));
LA->run();
return *LA;
}

bolt/Passes/DataflowInfoManager.h

@ -12,14 +12,15 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H
#include "FrameAnalysis.h"
#include "ReachingDefOrUse.h"
#include "StackReachingUses.h"
#include "DominatorAnalysis.h"
#include "StackPointerTracking.h"
#include "ReachingInsns.h"
#include "FrameAnalysis.h"
#include "LivenessAnalysis.h"
#include "ReachingDefOrUse.h"
#include "ReachingInsns.h"
#include "RegAnalysis.h"
#include "StackAllocationAnalysis.h"
#include "StackPointerTracking.h"
#include "StackReachingUses.h"
namespace llvm {
namespace bolt {
@ -29,6 +30,7 @@ namespace bolt {
/// recompute it. Also provide an interface for data invalidation when the
/// analysis is outdated after a transform pass modified the function.
class DataflowInfoManager {
const RegAnalysis *RA;
const FrameAnalysis *FA;
const BinaryContext &BC;
BinaryFunction &BF;
@ -46,8 +48,9 @@ class DataflowInfoManager {
InsnToBB;
public:
DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC,
BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {};
DataflowInfoManager(const BinaryContext &BC, BinaryFunction &BF,
const RegAnalysis *RA, const FrameAnalysis *FA)
: RA(RA), FA(FA), BC(BC), BF(BF){};
/// Helper function to fetch the parent BB associated with a program point
/// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock)
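
A usage sketch of the new constructor order (as FrameOptimizer and
IndirectCallPromotion do below; either analysis pointer may be nullptr when a
client does not need it, matching the asserts in DataflowInfoManager.cpp
above):

    DataflowInfoManager Info(BC, BF, &RA, &FA);  // was: Info(&FA, BC, BF)
    auto &LA = Info.getLivenessAnalysis();       // computed on demand, cached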

bolt/Passes/FrameAnalysis.cpp

@ -9,6 +9,7 @@
//
//===----------------------------------------------------------------------===//
#include "FrameAnalysis.h"
#include "CallGraphWalker.h"
#include <fstream>
#define DEBUG_TYPE "fa"
@ -213,9 +214,8 @@ public:
} // end anonymous namespace
void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst,
ArgAccesses &&AA) {
if (auto OldAA = getArgAccessesFor(BC, Inst)) {
void FrameAnalysis::addArgAccessesFor(MCInst &Inst, ArgAccesses &&AA) {
if (auto OldAA = getArgAccessesFor(Inst)) {
if (OldAA->AssumeEverything)
return;
*OldAA = std::move(AA);
@ -231,13 +231,12 @@ void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst,
ArgAccessesVector.emplace_back(std::move(AA));
}
void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC,
MCInst &Inst,
void FrameAnalysis::addArgInStackAccessFor(MCInst &Inst,
const ArgInStackAccess &Arg) {
auto AA = getArgAccessesFor(BC, Inst);
auto AA = getArgAccessesFor(Inst);
if (!AA) {
addArgAccessesFor(BC, Inst, ArgAccesses(false));
AA = getArgAccessesFor(BC, Inst);
addArgAccessesFor(Inst, ArgAccesses(false));
AA = getArgAccessesFor(Inst);
assert(AA && "Object setup failed");
}
auto &Set = AA->Set;
@ -245,15 +244,13 @@ void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC,
Set.emplace(Arg);
}
void FrameAnalysis::addFIEFor(const BinaryContext &BC, MCInst &Inst,
const FrameIndexEntry &FIE) {
void FrameAnalysis::addFIEFor(MCInst &Inst, const FrameIndexEntry &FIE) {
BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "FrameAccessEntry",
(unsigned)FIEVector.size());
FIEVector.emplace_back(FIE);
}
ErrorOr<ArgAccesses &>
FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, const MCInst &Inst) {
ErrorOr<ArgAccesses &> FrameAnalysis::getArgAccessesFor(const MCInst &Inst) {
if (auto Idx = BC.MIA->tryGetAnnotationAs<unsigned>(Inst, "ArgAccessEntry")) {
assert(ArgAccessesVector.size() > *Idx && "Out of bounds");
return ArgAccessesVector[*Idx];
@ -262,8 +259,7 @@ FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, const MCInst &Inst) {
}
ErrorOr<const ArgAccesses &>
FrameAnalysis::getArgAccessesFor(const BinaryContext &BC,
const MCInst &Inst) const {
FrameAnalysis::getArgAccessesFor(const MCInst &Inst) const {
if (auto Idx = BC.MIA->tryGetAnnotationAs<unsigned>(Inst, "ArgAccessEntry")) {
assert(ArgAccessesVector.size() > *Idx && "Out of bounds");
return ArgAccessesVector[*Idx];
@ -272,7 +268,7 @@ FrameAnalysis::getArgAccessesFor(const BinaryContext &BC,
}
ErrorOr<const FrameIndexEntry &>
FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const {
FrameAnalysis::getFIEFor(const MCInst &Inst) const {
if (auto Idx =
BC.MIA->tryGetAnnotationAs<unsigned>(Inst, "FrameAccessEntry")) {
assert(FIEVector.size() > *Idx && "Out of bounds");
@ -281,130 +277,17 @@ FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const {
return make_error_code(errc::result_out_of_range);
}
void FrameAnalysis::getInstClobberList(const BinaryContext &BC,
const MCInst &Inst,
BitVector &KillSet) const {
if (!BC.MIA->isCall(Inst)) {
BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI);
return;
}
void FrameAnalysis::traverseCG(BinaryFunctionCallGraph &CG) {
CallGraphWalker CGWalker(BC, BFs, CG);
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
// If indirect call, kill set should have all elements
if (TargetSymbol == nullptr) {
KillSet.set(0, KillSet.size());
return;
}
CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool {
return computeArgsAccessed(*Func);
});
const auto *Function = BC.getFunctionForSymbol(TargetSymbol);
if (Function == nullptr) {
// Call to a function without a BinaryFunction object.
// This should be a call to a PLT entry, and since it is a trampoline to
// a DSO, we can't really know the code in advance. Conservatively assume
// everything is clobbered.
KillSet.set(0, KillSet.size());
return;
}
auto BV = RegsKilledMap.find(Function);
if (BV != RegsKilledMap.end()) {
KillSet |= BV->second;
return;
}
// Ignore calls to functions whose clobber list wasn't yet calculated. This
// instruction will be evaluated again once we have info for the callee.
return;
CGWalker.walk();
}
BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC,
const BinaryFunction *Func) {
BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false);
if (!Func->isSimple() || !Func->hasCFG()) {
RegsKilled.set(0, RegsKilled.size());
return RegsKilled;
}
for (const auto &BB : *Func) {
for (const auto &Inst : BB) {
getInstClobberList(BC, Inst, RegsKilled);
}
}
return RegsKilled;
}
void FrameAnalysis::buildClobberMap(const BinaryContext &BC) {
std::queue<BinaryFunction *> Queue;
std::set<BinaryFunction *> InQueue;
for (auto *Func : TopologicalCGOrder) {
Queue.push(Func);
InQueue.insert(Func);
}
while (!Queue.empty()) {
auto *Func = Queue.front();
Queue.pop();
InQueue.erase(Func);
BitVector RegsKilled = getFunctionClobberList(BC, Func);
bool ArgsUpdated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func);
bool RegsUpdated = false;
if (RegsKilledMap.find(Func) == RegsKilledMap.end()) {
RegsKilledMap[Func] = std::move(RegsKilled);
} else {
RegsUpdated = RegsKilledMap[Func] != RegsKilled;
if (RegsUpdated)
RegsKilledMap[Func] = std::move(RegsKilled);
}
if (RegsUpdated || ArgsUpdated) {
for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) {
BinaryFunction *CallerFunc = Cg.nodeIdToFunc(Caller);
if (!InQueue.count(CallerFunc)) {
InQueue.insert(CallerFunc);
Queue.push(CallerFunc);
}
}
}
}
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType("fa"))
return;
#else
return;
#endif
}
// This loop is for computing statistics only
for (auto *Func : TopologicalCGOrder) {
auto Iter = RegsKilledMap.find(Func);
assert(Iter != RegsKilledMap.end() &&
"Failed to compute all clobbers list");
if (Iter->second.all()) {
auto Count = Func->getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountFunctionsAllClobber += Count;
++NumFunctionsAllClobber;
}
DEBUG_WITH_TYPE("fa",
dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n";
const BitVector &RegsKilled = Iter->second;
int RegIdx = RegsKilled.find_first();
while (RegIdx != -1) {
dbgs() << "\tREG" << RegIdx;
RegIdx = RegsKilled.find_next(RegIdx);
};
dbgs() << "\n";
);
}
}
bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC,
const BinaryFunction &BF, MCInst &Inst,
bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst,
int CurOffset) {
if (!BC.MIA->isCall(Inst))
return false;
@ -413,7 +296,7 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC,
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
// If indirect call, we conservatively assume it accesses all stack positions
if (TargetSymbol == nullptr) {
addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true));
addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true));
bool Updated{false};
if (!FunctionsRequireAlignment.count(&BF)) {
Updated = true;
@ -426,7 +309,7 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC,
// Call to a function without a BinaryFunction object. Conservatively assume
// it accesses all stack positions
if (Function == nullptr) {
addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true));
addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true));
bool Updated{false};
if (!FunctionsRequireAlignment.count(&BF)) {
Updated = true;
@ -459,27 +342,25 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC,
if (CurOffset == StackPointerTracking::EMPTY ||
CurOffset == StackPointerTracking::SUPERPOSITION) {
addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true));
addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true));
return Changed;
}
for (auto Elem : Iter->second) {
if (Elem.first == -1) {
addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true));
addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true));
break;
}
DEBUG(dbgs() << "Added arg in stack access annotation "
<< CurOffset + Elem.first << "\n");
addArgInStackAccessFor(
BC, Inst,
ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first,
/*Size=*/Elem.second});
Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first,
/*Size=*/Elem.second});
}
return Changed;
}
bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC,
BinaryFunction &BF) {
bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
if (!BF.isSimple() || !BF.hasCFG()) {
DEBUG(dbgs() << "Treating " << BF.getPrintName() << " conservatively.\n");
bool Updated = false;
@ -505,7 +386,7 @@ bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC,
// Check for calls -- attach stack accessing info to them regarding their
// target
if (updateArgsTouchedFor(BC, BF, Inst, FAA.getSPOffset()))
if (updateArgsTouchedFor(BF, Inst, FAA.getSPOffset()))
UpdatedArgsTouched = true;
// Check for stack accesses that affect callers
@ -548,8 +429,7 @@ bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC,
return UpdatedArgsTouched || UpdatedAlignedStatus;
}
bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC,
BinaryFunction &BF) {
bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
FrameAccessAnalysis FAA(BC, BF);
DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
@ -572,7 +452,7 @@ bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC,
const FrameIndexEntry &FIE = FAA.getFIE();
addFIEFor(BC, Inst, FIE);
addFIEFor(Inst, FIE);
DEBUG({
dbgs() << "Frame index annotation " << FIE << " added to:\n";
BC.printInstruction(dbgs(), Inst, 0, &BF, true);
@ -582,8 +462,7 @@ bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC,
return true;
}
void FrameAnalysis::cleanAnnotations(const BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
void FrameAnalysis::cleanAnnotations() {
for (auto &I : BFs) {
for (auto &BB : I.second) {
for (auto &Inst : BB) {
@ -594,24 +473,15 @@ void FrameAnalysis::cleanAnnotations(const BinaryContext &BC,
}
}
void FrameAnalysis::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
{
NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true);
Cg = buildCallGraph(BC, BFs);
}
{
NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true);
TopologicalCGOrder = Cg.buildTraversalOrder();
}
{
NamedRegionTimer T1("build clobber map", "FOP breakdown", true);
buildClobberMap(BC);
}
FrameAnalysis::FrameAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG)
: BC(BC), BFs(BFs) {
// Position 0 of the vector should always be associated with "assume access
// everything".
ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true));
if (ClobberAnalysisOnly)
return;
traverseCG(CG);
for (auto &I : BFs) {
auto Count = I.second.getExecutionCount();
@ -630,7 +500,7 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC,
{
NamedRegionTimer T1("restore frame index", "FOP breakdown", true);
if (!restoreFrameIndex(BC, I.second)) {
if (!restoreFrameIndex(I.second)) {
++NumFunctionsFailedRestoreFI;
auto Count = I.second.getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
@ -643,12 +513,7 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC,
}
void FrameAnalysis::printStats() {
outs() << "BOLT-INFO FRAME ANALYSIS: Number of functions conservatively "
"treated as clobbering all registers: "
<< NumFunctionsAllClobber
<< format(" (%.1lf%% dyn cov)\n",
(100.0 * CountFunctionsAllClobber / CountDenominator))
<< "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized
outs() << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized
<< " function(s) "
<< format("(%.1lf%% dyn cov)",
(100.0 * CountFunctionsNotOptimized / CountDenominator))

bolt/Passes/FrameAnalysis.h

@ -12,8 +12,8 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H
#include "BinaryPasses.h"
#include "BinaryFunctionCallGraph.h"
#include "BinaryPasses.h"
#include "StackPointerTracking.h"
namespace llvm {
@ -111,17 +111,9 @@ raw_ostream &operator<<(raw_ostream &OS,
/// ... callee may access any position of our current stack frame
/// }
///
class FrameAnalysis : public BinaryFunctionPass {
/// Call graph info
BinaryFunctionCallGraph Cg;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up
std::deque<BinaryFunction *> TopologicalCGOrder;
/// Map functions to the set of registers they may overwrite starting at when
/// it is called until it returns to the caller.
std::map<const BinaryFunction *, BitVector> RegsKilledMap;
class FrameAnalysis {
BinaryContext &BC;
std::map<uint64_t, BinaryFunction> &BFs;
/// Map functions to the set of <stack offsets, size> tuples representing
/// accesses to stack positions that belong to the caller
@ -142,70 +134,44 @@ class FrameAnalysis : public BinaryFunctionPass {
std::vector<FrameIndexEntry> FIEVector;
/// Analysis stats counters
uint64_t NumFunctionsAllClobber{0};
uint64_t CountFunctionsAllClobber{0};
uint64_t NumFunctionsNotOptimized{0};
uint64_t NumFunctionsFailedRestoreFI{0};
uint64_t CountFunctionsNotOptimized{0};
uint64_t CountFunctionsFailedRestoreFI{0};
uint64_t CountDenominator{0};
/// If this flag is set to true, the analysis will never run completely,
/// but will stop after callgraph and a clobber analysis for every function
/// has been computed.
bool ClobberAnalysisOnly{false};
/// Convenience functions for appending MCAnnotations to instructions with
/// our specific data
void addArgAccessesFor(const BinaryContext &BC, MCInst &Inst,
ArgAccesses &&AA);
void addArgInStackAccessFor(const BinaryContext &BC, MCInst &Inst,
const ArgInStackAccess &Arg);
void addFIEFor(const BinaryContext &BC, MCInst &Inst,
const FrameIndexEntry &FIE);
/// Compute the set of registers \p Func may write to during its execution,
/// starting at the point when it is called up until when it returns. Returns
/// a BitVector the size of the target number of registers, representing the
/// set of clobbered registers.
BitVector getFunctionClobberList(const BinaryContext &BC,
const BinaryFunction *Func);
void addArgAccessesFor(MCInst &Inst, ArgAccesses &&AA);
void addArgInStackAccessFor(MCInst &Inst, const ArgInStackAccess &Arg);
void addFIEFor(MCInst &Inst, const FrameIndexEntry &FIE);
/// Perform the step of building the set of registers clobbered by each
/// function execution, populating RegsKilledMap.
void buildClobberMap(const BinaryContext &BC);
/// function execution, populating RegsKilledMap and RegsGenMap.
void traverseCG(BinaryFunctionCallGraph &CG);
/// Analyzes an instruction and if it is a call, checks the called function
/// to record which args in stack are accessed, if any. Returns true if
/// the args data associated with this instruction were updated.
bool updateArgsTouchedFor(const BinaryContext &BC, const BinaryFunction &BF,
MCInst &Inst, int CurOffset);
bool updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst,
int CurOffset);
/// Performs a pass over \p BF to check for accesses to arguments in stack,
/// flagging those as accessing the caller stack frame. All functions called
/// by \p BF must have been previously analyzed. Returns true if updated
/// args data about this function.
bool computeArgsAccessed(const BinaryContext &BC, BinaryFunction &BF);
bool computeArgsAccessed(BinaryFunction &BF);
/// Alias analysis to disambiguate which frame position is accessed by each
/// instruction in function \p BF. Add MCAnnotation<FrameIndexEntry> to
/// instructions that access a frame position. Return false if it failed
/// to analyze and this information can't be safely determined for \p BF.
bool restoreFrameIndex(const BinaryContext &BC, BinaryFunction &BF);
bool restoreFrameIndex(BinaryFunction &BF);
public:
explicit FrameAnalysis(const cl::opt<bool> &PrintPass,
bool ClobberAnalysisOnly=false)
: BinaryFunctionPass(PrintPass),
ClobberAnalysisOnly(ClobberAnalysisOnly) {
// Position 0 of the vector should be always associated with "assume access
// everything".
ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true));
}
const char *getName() const override {
return "frame-analysis";
}
explicit FrameAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG);
/// Return true if we could fully analyze \p Func
bool hasFrameInfo(const BinaryFunction &Func) const {
@ -217,30 +183,19 @@ public:
return FunctionsRequireAlignment.count(&Func);
}
/// Compute the set of registers \p Inst may write to, marking them in
/// \p KillSet. If this is a call, try to get the set of registers the call
/// target will write to.
void getInstClobberList(const BinaryContext &BC, const MCInst &Inst,
BitVector &KillSet) const;
/// Functions for retrieving our specific MCAnnotation data from instructions
ErrorOr<ArgAccesses &> getArgAccessesFor(const BinaryContext &BC,
const MCInst &Inst);
ErrorOr<ArgAccesses &> getArgAccessesFor(const MCInst &Inst);
ErrorOr<const ArgAccesses &> getArgAccessesFor(const BinaryContext &BC,
const MCInst &Inst) const;
ErrorOr<const ArgAccesses &> getArgAccessesFor(const MCInst &Inst) const;
ErrorOr<const FrameIndexEntry &> getFIEFor(const BinaryContext &BC,
const MCInst &Inst) const;
/// Pass entry point
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
ErrorOr<const FrameIndexEntry &> getFIEFor(const MCInst &Inst) const;
/// Remove all MCAnnotations attached by this pass
void cleanAnnotations(const BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
void cleanAnnotations();
~FrameAnalysis() {
cleanAnnotations();
}
/// Print to standard output statistics about the analysis performed by this

bolt/Passes/FrameOptimizer.cpp

@ -10,7 +10,6 @@
//===----------------------------------------------------------------------===//
#include "FrameOptimizer.h"
#include "FrameAnalysis.h"
#include "ShrinkWrapping.h"
#include "StackAvailableExpressions.h"
#include "StackReachingUses.h"
@ -45,10 +44,11 @@ FrameOptimization("frame-opt",
namespace llvm {
namespace bolt {
void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA,
void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF) {
StackAvailableExpressions SAE(FA, BC, BF);
StackAvailableExpressions SAE(RA, FA, BC, BF);
SAE.run();
DEBUG(dbgs() << "Performing unnecessary loads removal\n");
@ -71,7 +71,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA,
// if Inst is a load from stack and the current available expressions show
// this value is available in a register or immediate, replace this load
// with move from register or from immediate.
auto FIEX = FA.getFIEFor(BC, Inst);
auto FIEX = FA.getFIEFor(Inst);
if (!FIEX) {
Prev = &Inst;
continue;
@ -88,7 +88,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA,
for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
I != ExprEnd; ++I) {
const MCInst *AvailableInst = *I;
auto FIEY = FA.getFIEFor(BC, *AvailableInst);
auto FIEY = FA.getFIEFor(*AvailableInst);
if (!FIEY)
continue;
assert(FIEY->IsStore && FIEY->IsSimple);
@ -172,7 +172,7 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
(*I)->dump();
}
});
auto FIEX = FA.getFIEFor(BC, Inst);
auto FIEX = FA.getFIEFor(Inst);
if (!FIEX) {
Prev = &Inst;
continue;
@ -217,8 +217,9 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
return;
// Run FrameAnalysis pass
FrameAnalysis FA(PrintPass);
FA.runOnFunctions(BC, BFs, LargeFunctions);
BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs);
FrameAnalysis FA(BC, BFs, CG);
RegAnalysis RA(BC, BFs, CG);
// Our main loop: perform caller-saved register optimizations, then
// callee-saved register optimizations (shrink wrapping).
@ -237,7 +238,7 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
}
{
NamedRegionTimer T1("remove loads", "FOP breakdown", true);
removeUnnecessaryLoads(FA, BC, I.second);
removeUnnecessaryLoads(RA, FA, BC, I.second);
}
{
NamedRegionTimer T1("remove stores", "FOP breakdown", true);
@ -248,14 +249,12 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
continue;
{
NamedRegionTimer T1("move spills", "FOP breakdown", true);
DataflowInfoManager Info(&FA, BC, I.second);
DataflowInfoManager Info(BC, I.second, &RA, &FA);
ShrinkWrapping SW(FA, BC, I.second, Info);
SW.perform();
}
}
FA.cleanAnnotations(BC, BFs);
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
<< " redundant load(s) and " << NumRedundantStores
<< " unused store(s)\n";

bolt/Passes/FrameOptimizer.h

@ -14,6 +14,7 @@
#include "BinaryPasses.h"
#include "FrameAnalysis.h"
#include "RegAnalysis.h"
namespace llvm {
namespace bolt {
@ -86,7 +87,8 @@ class FrameOptimizerPass : public BinaryFunctionPass {
/// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from
/// the frame. Use the analysis to convert memory loads to register moves or
/// immediate loads. Delete redundant register moves.
void removeUnnecessaryLoads(const FrameAnalysis &FA,
void removeUnnecessaryLoads(const RegAnalysis &RA,
const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF);

bolt/Passes/IndirectCallPromotion.cpp

@ -679,9 +679,12 @@ void IndirectCallPromotion::runOnFunctions(
if (opts::IndirectCallPromotion == ICP_NONE)
return;
FrameAnalysis FA(PrintPass, /*ClobberAnalysisOnly=*/true);
if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES)
FA.runOnFunctions(BC, BFs, LargeFunctions);
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs)));
RA.reset(new RegAnalysis(BC, BFs, *CG));
}
for (auto &BFIt : BFs) {
auto &Function = BFIt.second;
@ -716,7 +719,7 @@ void IndirectCallPromotion::runOnFunctions(
if (BBs.empty())
continue;
DataflowInfoManager Info(&FA, BC, Function);
DataflowInfoManager Info(BC, Function, RA.get(), nullptr);
while (!BBs.empty()) {
auto *BB = BBs.back();
BBs.pop_back();
@ -864,9 +867,6 @@ void IndirectCallPromotion::runOnFunctions(
TotalIndirectJmps += FuncTotalIndirectJmps;
}
if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES)
FA.cleanAnnotations(BC, BFs);
outs() << "BOLT-INFO: ICP total indirect callsites = "
<< TotalIndirectCallsites
<< "\n"

bolt/Passes/LivenessAnalysis.h

@ -13,9 +13,14 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_LIVENESSANALYSIS_H
#include "DataflowAnalysis.h"
#include "FrameAnalysis.h"
#include "RegAnalysis.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
namespace opts {
extern llvm::cl::opt<bool> AssumeABI;
}
namespace llvm {
namespace bolt {
@ -24,9 +29,9 @@ class LivenessAnalysis
friend class DataflowAnalysis<LivenessAnalysis, BitVector, true>;
public:
LivenessAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC,
BinaryFunction &BF)
: DataflowAnalysis<LivenessAnalysis, BitVector, true>(BC, BF), FA(FA),
: DataflowAnalysis<LivenessAnalysis, BitVector, true>(BC, BF), RA(RA),
NumRegs(BC.MRI->getNumRegs()) {}
virtual ~LivenessAnalysis();
@ -42,9 +47,21 @@ public:
DataflowAnalysis<LivenessAnalysis, BitVector, true>::run();
}
// Return a usable general-purpose reg after point P. Return 0 if no reg is
// available.
MCPhysReg scavengeRegAfter(ProgramPoint P) {
BitVector BV = *this->getStateAt(P);
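// Registers not live at P are dead there, hence safe to scavenge.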
BV.flip();
BitVector GPRegs(NumRegs, false);
this->BC.MIA->getGPRegs(GPRegs, *this->BC.MRI);
BV &= GPRegs;
int Reg = BV.find_first();
return Reg != -1 ? Reg : 0;
}
protected:
/// Reference to the result of stack frame analysis
const FrameAnalysis &FA;
/// Reference to the result of reg analysis
const RegAnalysis &RA;
const uint16_t NumRegs;
void preflight() {}
@ -63,18 +80,34 @@ protected:
BitVector computeNext(const MCInst &Point, const BitVector &Cur) {
BitVector Next = Cur;
bool IsCall = this->BC.MIA->isCall(Point);
// Kill
auto Written = BitVector(NumRegs, false);
if (this->BC.MIA->isCall(Point))
FA.getInstClobberList(this->BC, Point, Written);
else
if (!IsCall) {
this->BC.MIA->getWrittenRegs(Point, Written, *this->BC.MRI);
} else {
RA.getInstClobberList(Point, Written);
// When the clobber list is conservative, it clobbers all/most registers
// because the analysis knows nothing about this call. For our purposes,
// assume such a call kills only the callee-saved regs, since we don't
// really know what's going on.
if (RA.isConservative(Written)) {
Written.reset();
BC.MIA->getCalleeSavedRegs(Written, *this->BC.MRI);
}
}
Written.flip();
Next &= Written;
// Gen
if (!this->BC.MIA->isCFI(Point)) {
auto Used = BitVector(NumRegs, false);
this->BC.MIA->getUsedRegs(Point, Used, *this->BC.MRI);
RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/false);
if (IsCall &&
(!BC.MIA->isTailCall(Point) || !BC.MIA->isConditionalBranch(Point))) {
// Never gen FLAGS from a call unless it is a conditional tail call,
// which reads FLAGS; assuming any other call reads them would be overly
// conservative
Used.reset(BC.MIA->getFlagsReg());
}
Next |= Used;
}
return Next;

bolt/Passes/ReachingDefOrUse.h

@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H
#include "DataflowAnalysis.h"
#include "RegAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
@ -28,16 +29,16 @@ class ReachingDefOrUse
friend class DataflowAnalysis<ReachingDefOrUse<Def>, BitVector, !Def>;
public:
ReachingDefOrUse(const FrameAnalysis &FA, const BinaryContext &BC,
ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC,
BinaryFunction &BF)
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF), FA(FA) {}
: InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>(BC, BF), RA(RA) {}
virtual ~ReachingDefOrUse() {}
bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) {
for (auto I = Candidates; I != this->expr_end(); ++I) {
auto BV = BitVector(this->BC.MRI->getNumRegs(), false);
if (Def) {
FA.getInstClobberList(this->BC, **I, BV);
RA.getInstClobberList(**I, BV);
} else {
this->BC.MIA->getTouchedRegs(**I, BV, *this->BC.MRI);
}
@ -57,8 +58,8 @@ public:
}
protected:
/// Reference to the result of stack frame analysis
const FrameAnalysis &FA;
/// Reference to the result of reg analysis
const RegAnalysis &RA;
void preflight() {
// Populate our universe of tracked expressions with all instructions
@ -89,11 +90,11 @@ protected:
// getClobberedRegs for X and Y. If they intersect, return true
auto XClobbers = BitVector(this->BC.MRI->getNumRegs(), false);
auto YClobbers = BitVector(this->BC.MRI->getNumRegs(), false);
FA.getInstClobberList(this->BC, *X, XClobbers);
RA.getInstClobberList(*X, XClobbers);
// In defs, write after write -> kills first write
// In uses, write after access (read or write) -> kills access
if (Def)
FA.getInstClobberList(this->BC, *Y, YClobbers);
RA.getInstClobberList(*Y, YClobbers);
else
this->BC.MIA->getTouchedRegs(*Y, YClobbers, *this->BC.MRI);
// X kills Y if it clobbers Y completely -- this is a conservative approach.

bolt/Passes/RegAnalysis.cpp (new file, 207 lines)

@ -0,0 +1,207 @@
#include "RegAnalysis.h"
#include "CallGraphWalker.h"
#include "llvm/Support/CommandLine.h"
#define DEBUG_TYPE "ra"
using namespace llvm;
namespace opts {
extern cl::opt<unsigned> Verbosity;
extern cl::OptionCategory BoltOptCategory;
cl::opt<bool> AssumeABI(
"assume-abi",
cl::desc("assume the ABI is never violated"),
cl::ZeroOrMore,
cl::init(false),
cl::cat(BoltOptCategory));
}
namespace llvm {
namespace bolt {
RegAnalysis::RegAnalysis(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG)
: BC(BC) {
CallGraphWalker CGWalker(BC, BFs, CG);
CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool {
BitVector RegsKilled = getFunctionClobberList(Func);
bool Updated = RegsKilledMap.find(Func) == RegsKilledMap.end() ||
RegsKilledMap[Func] != RegsKilled;
if (Updated)
RegsKilledMap[Func] = std::move(RegsKilled);
return Updated;
});
CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool {
BitVector RegsGen = getFunctionUsedRegsList(Func);
bool Updated = RegsGenMap.find(Func) == RegsGenMap.end() ||
RegsGenMap[Func] != RegsGen;
if (Updated)
RegsGenMap[Func] = std::move(RegsGen);
return Updated;
});
CGWalker.walk();
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType(DEBUG_TYPE))
return;
#else
return;
#endif
}
// This loop is for computing statistics only
for (auto &MapEntry : BFs) {
auto *Func = &MapEntry.second;
auto Iter = RegsKilledMap.find(Func);
assert(Iter != RegsKilledMap.end() &&
"Failed to compute all clobbers list");
if (Iter->second.all()) {
auto Count = Func->getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountFunctionsAllClobber += Count;
++NumFunctionsAllClobber;
}
DEBUG_WITH_TYPE("fa",
dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n";
const BitVector &RegsKilled = Iter->second;
int RegIdx = RegsKilled.find_first();
while (RegIdx != -1) {
dbgs() << "\tREG" << RegIdx;
RegIdx = RegsKilled.find_next(RegIdx);
};
dbgs() << "\nUsed regs set for func: " << Func->getPrintName() << "\n";
const BitVector &RegsUsed = RegsGenMap.find(Func)->second;
RegIdx = RegsUsed.find_first();
while (RegIdx != -1) {
dbgs() << "\tREG" << RegIdx;
RegIdx = RegsUsed.find_next(RegIdx);
};
dbgs() << "\n";
);
}
}
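// Note on -assume-abi: a "conservative" register set below means the
// complement of the callee-saved registers (everything the ABI permits a
// call to clobber); without the flag, it means all registers.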
void RegAnalysis::beConservative(BitVector &Result) const {
if (!opts::AssumeABI) {
Result.set();
} else {
BitVector BV(BC.MRI->getNumRegs(), false);
BC.MIA->getCalleeSavedRegs(BV, *BC.MRI);
BV.flip();
Result |= BV;
}
}
bool RegAnalysis::isConservative(BitVector &Vec) const {
if (!opts::AssumeABI) {
return Vec.all();
} else {
BitVector BV(BC.MRI->getNumRegs(), false);
BC.MIA->getCalleeSavedRegs(BV, *BC.MRI);
BV |= Vec;
return BV.all();
}
}
void RegAnalysis::getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet,
bool GetClobbers) const {
if (!BC.MIA->isCall(Inst)) {
if (GetClobbers)
BC.MIA->getClobberedRegs(Inst, RegSet, *BC.MRI);
else
BC.MIA->getUsedRegs(Inst, RegSet, *BC.MRI);
return;
}
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
// If indirect call, we know nothing
if (TargetSymbol == nullptr) {
beConservative(RegSet);
return;
}
const auto *Function = BC.getFunctionForSymbol(TargetSymbol);
if (Function == nullptr) {
// Call to a function without a BinaryFunction object.
// This should be a call to a PLT entry, and since it is a trampoline to
// a DSO, we can't really know the code in advance.
beConservative(RegSet);
return;
}
if (GetClobbers) {
auto BV = RegsKilledMap.find(Function);
if (BV != RegsKilledMap.end()) {
RegSet |= BV->second;
return;
}
// Ignore calls to functions whose clobber list wasn't yet calculated. This
// instruction will be evaluated again once we have info for the callee.
return;
}
auto BV = RegsGenMap.find(Function);
if (BV != RegsGenMap.end()) {
RegSet |= BV->second;
return;
}
}
void RegAnalysis::getInstClobberList(const MCInst &Inst,
BitVector &KillSet) const {
return getInstUsedRegsList(Inst, KillSet, /*GetClobbers*/ true);
}
BitVector RegAnalysis::getFunctionUsedRegsList(const BinaryFunction *Func) {
BitVector UsedRegs = BitVector(BC.MRI->getNumRegs(), false);
if (!Func->isSimple() || !Func->hasCFG()) {
beConservative(UsedRegs);
return UsedRegs;
}
for (const auto &BB : *Func) {
for (const auto &Inst : BB) {
getInstUsedRegsList(Inst, UsedRegs, /*GetClobbers*/false);
if (UsedRegs.all())
return UsedRegs;
}
}
return UsedRegs;
}
BitVector RegAnalysis::getFunctionClobberList(const BinaryFunction *Func) {
BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false);
if (!Func->isSimple() || !Func->hasCFG()) {
beConservative(RegsKilled);
return RegsKilled;
}
for (const auto &BB : *Func) {
for (const auto &Inst : BB) {
getInstClobberList(Inst, RegsKilled);
if (RegsKilled.all())
return RegsKilled;
}
}
return RegsKilled;
}
void RegAnalysis::printStats() {
outs() << "BOLT-INFO REG ANALYSIS: Number of functions conservatively "
"treated as clobbering all registers: "
<< NumFunctionsAllClobber
<< format(" (%.1lf%% dyn cov)\n",
(100.0 * CountFunctionsAllClobber / CountDenominator));
}
}
}

bolt/Passes/RegAnalysis.h (new file, 82 lines)

@ -0,0 +1,82 @@
//===--- Passes/RegAnalysis.h ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REGANALYSIS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REGANALYSIS_H
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryFunctionCallGraph.h"
#include "llvm/ADT/BitVector.h"
#include <map>
namespace llvm {
namespace bolt {
/// Determine the set of registers read or clobbered for each instruction
/// in a BinaryFunction. If the instruction is a call, this analysis relies on
/// a call graph traversal to accurately extract the set of registers touched
/// after the call returns.
class RegAnalysis {
BinaryContext &BC;
/// Map functions to the set of registers they may overwrite, from the point
/// they are called until they return to the caller.
std::map<const BinaryFunction *, BitVector> RegsKilledMap;
/// Similar to the above, but for registers that are read in that function.
std::map<const BinaryFunction *, BitVector> RegsGenMap;
/// Analysis stats counters
uint64_t NumFunctionsAllClobber{0};
uint64_t CountFunctionsAllClobber{0};
uint64_t CountDenominator{0};
/// Helper function used to get the set of clobbered/used regs whenever
/// we know nothing about the function.
void beConservative(BitVector &Result) const;
/// Compute the set of registers \p Func may read from during its execution.
BitVector getFunctionUsedRegsList(const BinaryFunction *Func);
/// Compute the set of registers \p Func may write to during its execution,
/// starting at the point when it is called up until when it returns. Returns
/// a BitVector the size of the target number of registers, representing the
/// set of clobbered registers.
BitVector getFunctionClobberList(const BinaryFunction *Func);
public:
RegAnalysis(BinaryContext &BC, std::map<uint64_t, BinaryFunction> &BFs,
BinaryFunctionCallGraph &CG);
/// Compute the set of registers \p Inst may read from, marking them in
/// \p RegSet. If \p GetClobbers is true, compute instead the set of
/// registers \p Inst may write to.
/// Use the callgraph to fill out this info for calls.
void getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet,
bool GetClobbers) const;
/// Compute the set of registers \p Inst may write to, marking them in
/// \p KillSet. If this is a call, try to get the set of registers the call
/// target will write to.
void getInstClobberList(const MCInst &Inst, BitVector &KillSet) const;
/// Return true iff Vec has a conservative estimation of used/clobbered regs,
/// expressing no specific knowledge of reg usage.
bool isConservative(BitVector &Vec) const;
/// Print stats about the quality of our analysis
void printStats();
};
}
}
#endif
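
A client-side sketch (mirroring FrameOptimizer.cpp earlier in this diff):

    BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs);
    RegAnalysis RA(BC, BFs, CG);        // the CG walk runs in the constructor
    BitVector Regs(BC.MRI->getNumRegs(), false);
    RA.getInstClobberList(Inst, Regs);  // regs Inst may write; for direct
                                        // calls, uses the callee's summary
    if (RA.isConservative(Regs)) {
      // the analysis has no specific knowledge of this instruction's effects
    }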

bolt/Passes/ShrinkWrapping.cpp

@ -41,7 +41,7 @@ void CalleeSavedAnalysis::analyzeSaves() {
DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (auto &Inst : BB) {
if (auto FIE = FA.getFIEFor(BC, Inst)) {
if (auto FIE = FA.getFIEFor(Inst)) {
if (!FIE->IsStore || !FIE->IsSimple || !FIE->IsStoreFromReg ||
FIE->StackOffset >= 0) {
Prev = &Inst;
@ -86,7 +86,7 @@ void CalleeSavedAnalysis::analyzeRestores() {
const MCInst *Prev = nullptr;
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
auto &Inst = *I;
if (auto FIE = FA.getFIEFor(BC, Inst)) {
if (auto FIE = FA.getFIEFor(Inst)) {
if (!FIE->IsLoad || !FIE->IsSimple || !CalleeSaved[FIE->RegOrImm] ||
FIE->StackOffset >= 0) {
Prev = &Inst;
@ -229,7 +229,7 @@ void StackLayoutModifier::classifyStackAccesses() {
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
auto &Inst = *I;
checkFramePointerInitialization(Inst);
auto FIEX = FA.getFIEFor(BC, Inst);
auto FIEX = FA.getFIEFor(Inst);
if (!FIEX) {
Prev = &Inst;
continue;
@ -346,7 +346,7 @@ bool StackLayoutModifier::canCollapseRegion(MCInst *DeletedPush) {
if (!IsSimple || !BC.MIA->isPush(*DeletedPush))
return false;
auto FIE = FA.getFIEFor(BC, *DeletedPush);
auto FIE = FA.getFIEFor(*DeletedPush);
if (!FIE)
return false;
@ -370,7 +370,7 @@ bool StackLayoutModifier::canCollapseRegion(int64_t RegionAddr) {
}
bool StackLayoutModifier::collapseRegion(MCInst *DeletedPush) {
auto FIE = FA.getFIEFor(BC, *DeletedPush);
auto FIE = FA.getFIEFor(*DeletedPush);
if (!FIE)
return false;
int64_t RegionAddr = FIE->StackOffset;
@ -414,7 +414,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr,
continue;
}
auto FIE = FA.getFIEFor(BC, Inst);
auto FIE = FA.getFIEFor(Inst);
assert(FIE);
if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr)
continue;
@ -499,7 +499,7 @@ bool StackLayoutModifier::insertRegion(ProgramPoint P, int64_t RegionSz) {
continue;
}
auto FIE = FA.getFIEFor(BC, Inst);
auto FIE = FA.getFIEFor(Inst);
assert(FIE);
if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr)
continue;

bolt/Passes/StackAvailableExpressions.cpp

@ -17,10 +17,11 @@
namespace llvm {
namespace bolt {
StackAvailableExpressions::StackAvailableExpressions(const FrameAnalysis &FA,
StackAvailableExpressions::StackAvailableExpressions(const RegAnalysis &RA,
const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF)
: InstrsDataflowAnalysis(BC, BF), FA(FA) {}
: InstrsDataflowAnalysis(BC, BF), RA(RA), FA(FA) {}
void StackAvailableExpressions::preflight() {
DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
@ -31,7 +32,7 @@ void StackAvailableExpressions::preflight() {
// program.
for (auto &BB : Func) {
for (auto &Inst : BB) {
auto FIE = FA.getFIEFor(BC, Inst);
auto FIE = FA.getFIEFor(Inst);
if (!FIE)
continue;
if (FIE->IsStore == true && FIE->IsSimple == true) {
@ -80,8 +81,8 @@ bool isLoadRedundant(const FrameIndexEntry &LoadFIE,
bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) {
// if both are stores, and both store to the same stack location, return
// true
auto FIEX = FA.getFIEFor(BC, *X);
auto FIEY = FA.getFIEFor(BC, *Y);
auto FIEX = FA.getFIEFor(*X);
auto FIEY = FA.getFIEFor(*Y);
if (FIEX && FIEY) {
if (isLoadRedundant(*FIEX, *FIEY))
return false;
@ -93,14 +94,14 @@ bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) {
// getClobberedRegs for X and Y. If they intersect, return true
BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false);
BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false);
FA.getInstClobberList(BC, *X, XClobbers);
RA.getInstClobberList(*X, XClobbers);
// If Y is a store to stack, its clobber list is its source reg. This is
// different than the rest because we want to check if the store source
// reaches its corresponding load untouched.
if (FIEY && FIEY->IsStore == true && FIEY->IsStoreFromReg) {
YClobbers.set(FIEY->RegOrImm);
} else {
FA.getInstClobberList(BC, *Y, YClobbers);
RA.getInstClobberList(*Y, YClobbers);
}
XClobbers &= YClobbers;
return XClobbers.any();
@ -121,7 +122,7 @@ BitVector StackAvailableExpressions::computeNext(const MCInst &Point,
}
}
// Gen
if (auto FIE = FA.getFIEFor(BC, Point)) {
if (auto FIE = FA.getFIEFor(Point)) {
if (FIE->IsStore == true && FIE->IsSimple == true)
Next.set(ExprToIdx[&Point]);
}

bolt/Passes/StackAvailableExpressions.h

@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H
#include "DataflowAnalysis.h"
#include "RegAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
@ -25,7 +26,7 @@ class StackAvailableExpressions
friend class DataflowAnalysis<StackAvailableExpressions, BitVector>;
public:
StackAvailableExpressions(const FrameAnalysis &FA,
StackAvailableExpressions(const RegAnalysis &RA, const FrameAnalysis &FA,
const BinaryContext &BC, BinaryFunction &BF);
virtual ~StackAvailableExpressions() {}
@ -35,7 +36,7 @@ public:
}
protected:
/// Reference to the result of stack frame analysis
const RegAnalysis &RA;
const FrameAnalysis &FA;
void preflight();

bolt/Passes/StackReachingUses.cpp

@ -22,7 +22,7 @@ bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE,
for (auto I = Candidates; I != expr_end(); ++I) {
const MCInst *ReachingInst = *I;
if (IncludeLocalAccesses) {
if (auto FIEY = FA.getFIEFor(BC, *ReachingInst)) {
if (auto FIEY = FA.getFIEFor(*ReachingInst)) {
assert(FIEY->IsLoad == 1);
if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset &&
StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size) {
@ -30,7 +30,7 @@ bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE,
}
}
}
auto Args = FA.getArgAccessesFor(BC, *ReachingInst);
auto Args = FA.getArgAccessesFor(*ReachingInst);
if (!Args)
continue;
if (Args->AssumeEverything) {
@ -55,14 +55,14 @@ void StackReachingUses::preflight() {
// program.
for (auto &BB : Func) {
for (auto &Inst : BB) {
if (auto FIE = FA.getFIEFor(BC, Inst)) {
if (auto FIE = FA.getFIEFor(Inst)) {
if (FIE->IsLoad == true) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
continue;
}
}
auto AA = FA.getArgAccessesFor(BC, Inst);
auto AA = FA.getArgAccessesFor(Inst);
if (AA && (!AA->Set.empty() || AA->AssumeEverything)) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
@ -74,8 +74,8 @@ void StackReachingUses::preflight() {
bool StackReachingUses::doesXKillsY(const MCInst *X, const MCInst *Y) {
// if X is a store to the same stack location and the bytes fetched is a
// superset of those bytes affected by the load in Y, return true
auto FIEX = FA.getFIEFor(BC, *X);
auto FIEY = FA.getFIEFor(BC, *Y);
auto FIEX = FA.getFIEFor(*X);
auto FIEY = FA.getFIEFor(*Y);
if (FIEX && FIEY) {
if (FIEX->IsStore == true && FIEY->IsLoad == true &&
FIEX->StackOffset <= FIEY->StackOffset &&
@ -98,11 +98,11 @@ BitVector StackReachingUses::computeNext(const MCInst &Point,
}
};
// Gen
if (auto FIE = FA.getFIEFor(BC, Point)) {
if (auto FIE = FA.getFIEFor(Point)) {
if (FIE->IsLoad == true)
Next.set(ExprToIdx[&Point]);
}
auto AA = FA.getArgAccessesFor(BC, Point);
auto AA = FA.getArgAccessesFor(Point);
if (AA && (!AA->Set.empty() || AA->AssumeEverything))
Next.set(ExprToIdx[&Point]);
return Next;