[BOLT] Move BOLT passes under Passes subdirectory (NFC).
Summary: Move passes under Passes subdirectory. Move inlining passes under Passes/Inliner.* (cherry picked from FBD4575832)
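The visible effect on client code is an include-path change plus a new static library; a representative before/after, assembled from the hunks below:

    // before
    #include "BinaryPasses.h"
    #include "FrameOptimizerPass.h"

    // after
    #include "Passes/BinaryPasses.h"
    #include "Passes/FrameOptimizer.h"
    #include "Passes/Inliner.h"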
This commit is contained in:
parent f06a1455ea
commit 88244a10bb
@@ -12,8 +12,8 @@
 #include "BinaryBasicBlock.h"
 #include "BinaryFunction.h"
-#include "ReorderAlgorithm.h"
 #include "DataReader.h"
+#include "Passes/ReorderAlgorithm.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -10,7 +10,8 @@
 //===----------------------------------------------------------------------===//

 #include "BinaryPassManager.h"
-#include "FrameOptimizerPass.h"
+#include "Passes/FrameOptimizer.h"
+#include "Passes/Inliner.h"
 #include "llvm/Support/Timer.h"

 using namespace llvm;
@@ -15,7 +15,7 @@
 #define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H

 #include "BinaryFunction.h"
-#include "BinaryPasses.h"
+#include "Passes/BinaryPasses.h"
 #include "llvm/Support/Options.h"
 #include "llvm/Support/CommandLine.h"
 #include <map>
@@ -1,7 +1,9 @@
 add_subdirectory(merge-fdata)
+add_subdirectory(Passes)

 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BOLTPasses
   CodeGen
   Core
   DebugInfoDWARF
@@ -18,13 +20,10 @@ add_llvm_tool(llvm-bolt
   BinaryBasicBlock.cpp
   BinaryContext.cpp
   BinaryFunction.cpp
-  BinaryPasses.cpp
   BinaryPassManager.cpp
   DataReader.cpp
   DebugData.cpp
   Exceptions.cpp
-  FrameOptimizerPass.cpp
   RewriteInstance.cpp
-  ReorderAlgorithm.cpp
   DWARFRewriter.cpp
   )
File diff suppressed because it is too large.
@@ -0,0 +1,490 @@
//===--- BinaryPasses.h - Binary-level analysis/optimization passes -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H

#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>

namespace llvm {
namespace bolt {

/// An optimization/analysis pass that runs on functions.
class BinaryFunctionPass {
  const cl::opt<bool> &PrintPass;
protected:
  explicit BinaryFunctionPass(const cl::opt<bool> &PrintPass)
    : PrintPass(PrintPass) { }

  /// Control whether a specific function should be skipped during
  /// optimization.
  bool shouldOptimize(const BinaryFunction &BF) const;
public:
  virtual ~BinaryFunctionPass() = default;

  /// The name of this pass.
  virtual const char *getName() const = 0;

  /// Control whether debug info is printed after this pass is completed.
  bool printPass() const { return PrintPass; }

  /// Control whether debug info is printed for an individual function after
  /// this pass is completed (printPass() must have returned true).
  virtual bool shouldPrint(const BinaryFunction &BF) const;

  /// Execute this pass on the given functions.
  virtual void runOnFunctions(BinaryContext &BC,
                              std::map<uint64_t, BinaryFunction> &BFs,
                              std::set<uint64_t> &LargeFunctions) = 0;
};

/// Detects functions that simply do a tail call when they are called and
/// optimizes calls to these functions.
class OptimizeBodylessFunctions : public BinaryFunctionPass {
private:
  /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G,
  /// thus calls to F can be optimized to calls to G.
  std::unordered_map<const MCSymbol *, const BinaryFunction *>
    EquivalentCallTarget;

  void analyze(BinaryFunction &BF,
               BinaryContext &BC,
               std::map<uint64_t, BinaryFunction> &BFs);

  void optimizeCalls(BinaryFunction &BF,
                     BinaryContext &BC);

  /// Stats for eliminated calls.
  uint64_t NumEliminatedCalls{0};
  uint64_t NumOptimizedCallSites{0};

public:
  explicit OptimizeBodylessFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }
  const char *getName() const override {
    return "optimize-bodyless";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Detect and eliminate unreachable basic blocks. Such blocks may be filled
/// with nops and used for alignment.
class EliminateUnreachableBlocks : public BinaryFunctionPass {
  std::unordered_set<const BinaryFunction *> Modified;
  unsigned DeletedBlocks{0};
  uint64_t DeletedBytes{0};
  void runOnFunction(BinaryFunction& Function);
public:
  EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "eliminate-unreachable";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext&,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

// Reorder the basic blocks for each function based on hotness.
class ReorderBasicBlocks : public BinaryFunctionPass {
public:
  explicit ReorderBasicBlocks(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "reordering";
  }
  bool shouldPrint(const BinaryFunction &BF) const override;
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Sync local branches with CFG.
class FixupBranches : public BinaryFunctionPass {
public:
  explicit FixupBranches(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "fix-branches";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Fix the CFI state and exception handling information after all other
/// passes have completed.
class FixupFunctions : public BinaryFunctionPass {
public:
  explicit FixupFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "fixup-functions";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization to simplify conditional tail calls by removing
/// unnecessary branches.
///
/// This optimization considers both of the following cases:
///
/// foo: ...
///      jcc L1   # original
///      ...
/// L1:  jmp bar  # TAILJMP
///
/// ->
///
/// foo: ...
///      jcc bar  # iff jcc L1 is expected
///      ...
///
/// L1 is unreachable
///
/// OR
///
/// foo: ...
///      jcc  L2
/// L1:  jmp  dest  # TAILJMP
/// L2:  ...
///
/// ->
///
/// foo: jncc dest  # TAILJMP
/// L2:  ...
///
/// L1 is unreachable
///
/// For this particular case, the first basic block ends with
/// a conditional branch and has two successors, one fall-through
/// and one for when the condition is true.
/// The target of the conditional is a basic block with a single
/// unconditional branch (i.e. tail call) to another function.
/// We don't care about the contents of the fall-through block.
/// We assume that the target of the conditional branch is the
/// first successor.
class SimplifyConditionalTailCalls : public BinaryFunctionPass {
  uint64_t NumCandidateTailCalls{0};
  uint64_t NumTailCallsPatched{0};
  uint64_t NumOrigForwardBranches{0};
  uint64_t NumOrigBackwardBranches{0};
  std::unordered_set<const BinaryFunction *> Modified;

  bool shouldRewriteBranch(const BinaryBasicBlock *PredBB,
                           const MCInst &CondBranch,
                           const BinaryBasicBlock *BB,
                           const bool DirectionFlag);

  uint64_t fixTailCalls(BinaryContext &BC, BinaryFunction &BF);
public:
  explicit SimplifyConditionalTailCalls(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "simplify-conditional-tail-calls";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Perform simple peephole optimizations.
class Peepholes : public BinaryFunctionPass {
  uint64_t NumDoubleJumps{0};
  uint64_t TailCallTraps{0};

  /// Attempt to use the minimum operand width for arithmetic, branch and
  /// move instructions.
  void shortenInstructions(BinaryContext &BC, BinaryFunction &Function);

  /// Replace double jumps with a jump directly to the target, i.e.
  /// jmp/jcc L1; L1: jmp L2 -> jmp/jcc L2.
  void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function);

  /// Add trap instructions immediately after indirect tail calls to prevent
  /// the processor from decoding instructions immediately following the
  /// tailcall.
  void addTailcallTraps(BinaryContext &BC, BinaryFunction &Function);
public:
  explicit Peepholes(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "peepholes";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization to simplify loads from read-only sections. The pass
/// converts load instructions with statically computed target address such
/// as:
///
///      mov 0x12f(%rip), %eax
///
/// to their counterparts that use immediate operands instead of memory loads:
///
///     mov $0x4007dc, %eax
///
/// when the target address points somewhere inside a read-only section.
///
class SimplifyRODataLoads : public BinaryFunctionPass {
  uint64_t NumLoadsSimplified{0};
  uint64_t NumDynamicLoadsSimplified{0};
  uint64_t NumLoadsFound{0};
  uint64_t NumDynamicLoadsFound{0};
  std::unordered_set<const BinaryFunction *> Modified;

  bool simplifyRODataLoads(BinaryContext &BC, BinaryFunction &BF);

public:
  explicit SimplifyRODataLoads(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "simplify-read-only-loads";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization that replaces references to identical functions with
/// references to a single one of them.
///
class IdenticalCodeFolding : public BinaryFunctionPass {
public:
  explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "identical-code-folding";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

///
/// Prints a list of the top 100 functions sorted by a set of
/// dyno stats categories.
///
class PrintSortedBy : public BinaryFunctionPass {
public:
  explicit PrintSortedBy(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "print-sorted-by";
  }
  bool shouldPrint(const BinaryFunction &) const override {
    return false;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Optimize indirect calls.
/// The indirect call promotion pass visits each indirect call and
/// examines the BranchData for each. If the most frequent targets
/// from that callsite exceed the specified threshold (default 90%),
/// the call is promoted. Otherwise, it is ignored. By default,
/// only one target is considered at each callsite.
///
/// When a candidate callsite is processed, we modify the callsite
/// to test for the most common call targets before calling through
/// the original generic call mechanism.
///
/// The CFG and layout are modified by ICP.
///
/// A few new command line options have been added:
///   -indirect-call-promotion
///   -indirect-call-promotion-threshold=<percentage>
///   -indirect-call-promotion-mispredict-threshold=<percentage>
///   -indirect-call-promotion-topn=<int>
///
/// The threshold is the minimum frequency of a call target needed
/// before ICP is triggered.
///
/// The mispredict threshold is used to disable the optimization at
/// any callsite where the branch predictor does a good enough job
/// that ICP wouldn't help regardless of the frequency of the most
/// common target.
///
/// The topn option controls the number of targets to consider for
/// each callsite, e.g. ICP is triggered if topn=2 and the total
/// frequency of the top two call targets exceeds the threshold.
///
/// The minimize code size option controls whether or not the hot
/// calls are to registers (callq %r10) or to function addresses
/// (callq $foo).
///
/// Example of ICP:
///
/// C++ code:
///
///   int B_count = 0;
///   int C_count = 0;
///
///   struct A { virtual void foo() = 0; }
///   struct B : public A { virtual void foo() { ++B_count; }; };
///   struct C : public A { virtual void foo() { ++C_count; }; };
///
///   A* a = ...
///   a->foo();
///   ...
///
/// original assembly:
///
///   B0: 49 8b 07             mov    (%r15),%rax
///       4c 89 ff             mov    %r15,%rdi
///       ff 10                callq  *(%rax)
///       41 83 e6 01          and    $0x1,%r14d
///       4d 89 e6             mov    %r12,%r14
///       4c 0f 44 f5          cmove  %rbp,%r14
///       4c 89 f7             mov    %r14,%rdi
///       ...
///
/// after ICP:
///
///   B0: 49 8b 07             mov    (%r15),%rax
///       4c 89 ff             mov    %r15,%rdi
///       48 81 38 e0 0b 40 00 cmpq   $B::foo,(%rax)
///       75 29                jne    B3
///   B1: e8 45 03 00 00       callq  $B::foo
///   B2: 41 83 e6 01          and    $0x1,%r14d
///       4d 89 e6             mov    %r12,%r14
///       4c 0f 44 f5          cmove  %rbp,%r14
///       4c 89 f7             mov    %r14,%rdi
///       ...
///
///   B3: ff 10                callq  *(%rax)
///       eb d6                jmp    B2
///
class IndirectCallPromotion : public BinaryFunctionPass {
  using BasicBlocksVector = std::vector<std::unique_ptr<BinaryBasicBlock>>;
  std::unordered_set<const BinaryFunction *> Modified;
  // Total number of calls from all callsites.
  uint64_t TotalCalls{0};

  // Total number of indirect calls from all callsites.
  // (a fraction of TotalCalls)
  uint64_t TotalIndirectCalls{0};

  // Total number of callsites that use indirect calls.
  // (the total number of callsites is not recorded)
  uint64_t TotalIndirectCallsites{0};

  // Total number of indirect callsites that are optimized by ICP.
  // (a fraction of TotalIndirectCallsites)
  uint64_t TotalOptimizedIndirectCallsites{0};

  // Total number of indirect calls that are optimized by ICP.
  // (a fraction of TotalCalls)
  uint64_t TotalNumFrequentCalls{0};

  std::vector<BranchInfo> getCallTargets(BinaryContext &BC,
                                         const FuncBranchData &BranchData,
                                         const MCInst &Inst) const;

  size_t canPromoteCallsite(const BinaryBasicBlock *BB,
                            const MCInst &Inst,
                            const std::vector<BranchInfo> &Targets,
                            uint64_t NumCalls);

  void printCallsiteInfo(const BinaryBasicBlock *BB,
                         const MCInst &Inst,
                         const std::vector<BranchInfo> &Targets,
                         const size_t N,
                         uint64_t NumCalls) const;

  std::vector<std::pair<MCSymbol *, uint64_t>>
  findCallTargetSymbols(BinaryContext &BC,
                        const std::vector<BranchInfo> &Targets,
                        const size_t N) const;

  std::vector<std::unique_ptr<BinaryBasicBlock>>
  rewriteCall(BinaryContext &BC,
              BinaryFunction &Function,
              BinaryBasicBlock *IndCallBlock,
              const MCInst &CallInst,
              MCInstrAnalysis::ICPdata &&ICPcode) const;

  BinaryBasicBlock *fixCFG(BinaryContext &BC,
                           BinaryFunction &Function,
                           BinaryBasicBlock *IndCallBlock,
                           const bool IsTailCall,
                           BasicBlocksVector &&NewBBs,
                           const std::vector<BranchInfo> &Targets) const;

public:
  explicit IndirectCallPromotion(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "indirect-call-promotion";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Pass for lowering any instructions that we have raised and that have
/// to be lowered.
class InstructionLowering : public BinaryFunctionPass {
public:
  explicit InstructionLowering(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) {}

  const char *getName() const override {
    return "inst-lowering";
  }

  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

#endif
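As an aside, a minimal client of the interface declared above could look like the sketch below. Only the constructor, getName(), and runOnFunctions() signatures are taken from BinaryPasses.h; the class name and body are hypothetical:

    // Hypothetical example, not part of this commit.
    class PrintFunctionCount : public BinaryFunctionPass {
    public:
      explicit PrintFunctionCount(const cl::opt<bool> &PrintPass)
        : BinaryFunctionPass(PrintPass) { }

      const char *getName() const override {
        return "print-function-count";
      }
      void runOnFunctions(BinaryContext &BC,
                          std::map<uint64_t, BinaryFunction> &BFs,
                          std::set<uint64_t> &LargeFunctions) override {
        // Count basic blocks across all functions; BinaryFunction::size()
        // returns the number of basic blocks (see its use in Inliner.cpp
        // below, where size() != 1 filters out multi-block functions).
        uint64_t NumBlocks = 0;
        for (auto &It : BFs)
          NumBlocks += It.second.size();
        errs() << "BOLT-INFO: " << BFs.size() << " functions, "
               << NumBlocks << " basic blocks\n";
      }
    };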
@@ -0,0 +1,8 @@
add_llvm_library(LLVMBOLTPasses
  BinaryPasses.cpp
  FrameOptimizer.cpp
  Inliner.cpp
  ReorderAlgorithm.cpp
  )

include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt )
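Note: by LLVM's usual component-naming convention, add_llvm_library(LLVMBOLTPasses ...) here is presumably what the BOLTPasses entry added to LLVM_LINK_COMPONENTS in the top-level CMakeLists.txt hunk above resolves to (the LLVM prefix being implied by the component system).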
@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.cpp -------------------------------------------===//
+//===--- Passes/FrameOptimizer.cpp ----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //

@@ -9,7 +9,7 @@
 //
 //===----------------------------------------------------------------------===//

-#include "FrameOptimizerPass.h"
+#include "FrameOptimizer.h"
 #include <queue>
 #include <unordered_map>
@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.h ---------------------------------------------===//
+//===--- Passes/FrameOptimizer.h ------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //

@@ -9,8 +9,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef FRAMEOPTIMIZERPASS_H
-#define FRAMEOPTIMIZERPASS_H
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H

 #include "BinaryPasses.h"
@@ -0,0 +1,609 @@
//===--- Passes/Inliner.cpp - Inlining infra for BOLT ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Inliner.h"
#include "llvm/Support/Options.h"

#define DEBUG_TYPE "bolt-inliner"

using namespace llvm;

namespace opts {

static cl::list<std::string>
ForceInlineFunctions("force-inline",
                     cl::CommaSeparated,
                     cl::desc("list of functions to always consider "
                              "for inlining"),
                     cl::value_desc("func1,func2,func3,..."),
                     cl::Hidden);

static cl::opt<bool>
AggressiveInlining("aggressive-inlining",
                   cl::desc("perform aggressive inlining"),
                   cl::ZeroOrMore,
                   cl::Hidden);

}

namespace llvm {
namespace bolt {

void InlineSmallFunctions::findInliningCandidates(
    BinaryContext &BC,
    const std::map<uint64_t, BinaryFunction> &BFs) {
  for (const auto &BFIt : BFs) {
    const auto &Function = BFIt.second;
    if (!shouldOptimize(Function) || Function.size() != 1)
      continue;
    auto &BB = *Function.begin();
    const auto &LastInstruction = *BB.rbegin();
    // Check if the function is small enough, doesn't do a tail call
    // and doesn't throw exceptions.
    if (BB.size() > 0 &&
        BB.getNumNonPseudos() <= kMaxInstructions &&
        BB.lp_empty() &&
        BC.MIA->isReturn(LastInstruction) &&
        !BC.MIA->isTailCall(LastInstruction)) {
      InliningCandidates.insert(&Function);
    }
  }

  DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
               << " inlineable functions.\n");
}

void InlineSmallFunctions::findInliningCandidatesAggressive(
    BinaryContext &BC,
    const std::map<uint64_t, BinaryFunction> &BFs) {
  std::set<std::string> OverwrittenFunctions = {
    "_ZN4HPHP13hash_string_iEPKcj",
    "_ZN4HPHP21hash_string_cs_unsafeEPKcj",
    "_ZN4HPHP14hash_string_csEPKcj",
    "_ZN4HPHP20hash_string_i_unsafeEPKcj",
    "_ZNK4HPHP10StringData10hashHelperEv"
  };
  for (const auto &BFIt : BFs) {
    const auto &Function = BFIt.second;
    if (!shouldOptimize(Function) ||
        OverwrittenFunctions.count(Function.getSymbol()->getName()) ||
        Function.hasEHRanges())
      continue;
    uint64_t FunctionSize = 0;
    for (const auto *BB : Function.layout()) {
      FunctionSize += BC.computeCodeSize(BB->begin(), BB->end());
    }
    assert(FunctionSize > 0 && "found empty function");
    if (FunctionSize > kMaxSize)
      continue;
    bool FoundCFI = false;
    for (const auto BB : Function.layout()) {
      for (const auto &Inst : *BB) {
        if (BC.MIA->isEHLabel(Inst) || BC.MIA->isCFI(Inst)) {
          FoundCFI = true;
          break;
        }
      }
    }
    if (!FoundCFI)
      InliningCandidates.insert(&Function);
  }

  DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
               << " inlineable functions.\n");
}

namespace {

/// Returns whether a function creates a stack frame for itself or not.
/// If so, we need to manipulate the stack pointer when calling this function.
/// Since we're only inlining very small functions, we return false for now, but
/// we could for instance check if the function starts with 'push ebp'.
/// TODO: generalize this.
bool createsStackFrame(const BinaryBasicBlock &) {
  return false;
}

} // namespace

void InlineSmallFunctions::inlineCall(
    BinaryContext &BC,
    BinaryBasicBlock &BB,
    MCInst *CallInst,
    const BinaryBasicBlock &InlinedFunctionBB) {
  assert(BC.MIA->isCall(*CallInst) && "Can only inline a call.");
  assert(BC.MIA->isReturn(*InlinedFunctionBB.rbegin()) &&
         "Inlined function should end with a return.");

  std::vector<MCInst> InlinedInstance;

  bool ShouldAdjustStack = createsStackFrame(InlinedFunctionBB);

  // Move stack like 'call' would if needed.
  if (ShouldAdjustStack) {
    MCInst StackInc;
    BC.MIA->createStackPointerIncrement(StackInc);
    InlinedInstance.push_back(StackInc);
  }

  for (auto Instruction : InlinedFunctionBB) {
    if (BC.MIA->isReturn(Instruction)) {
      break;
    }
    if (!BC.MIA->isEHLabel(Instruction) &&
        !BC.MIA->isCFI(Instruction)) {
      InlinedInstance.push_back(Instruction);
    }
  }

  // Move stack pointer like 'ret' would.
  if (ShouldAdjustStack) {
    MCInst StackDec;
    BC.MIA->createStackPointerDecrement(StackDec);
    InlinedInstance.push_back(StackDec);
  }

  BB.replaceInstruction(CallInst, InlinedInstance);
}

std::pair<BinaryBasicBlock *, unsigned>
InlineSmallFunctions::inlineCall(
    BinaryContext &BC,
    BinaryFunction &CallerFunction,
    BinaryBasicBlock *CallerBB,
    const unsigned CallInstIndex,
    const BinaryFunction &InlinedFunction) {
  // Get the instruction to be replaced with inlined code.
  MCInst &CallInst = CallerBB->getInstructionAtIndex(CallInstIndex);
  assert(BC.MIA->isCall(CallInst) && "Can only inline a call.");

  // Point in the function after the inlined code.
  BinaryBasicBlock *AfterInlinedBB = nullptr;
  unsigned AfterInlinedIstrIndex = 0;

  // In case of a tail call we should not remove any ret instructions from the
  // inlined instance.
  bool IsTailCall = BC.MIA->isTailCall(CallInst);

  // The first block of the function to be inlined can be merged with the caller
  // basic block. This cannot happen if there are jumps to the first block.
  bool CanMergeFirstInlinedBlock = (*InlinedFunction.begin()).pred_size() == 0;

  // If the call to be inlined is not at the end of its basic block and we have
  // to inline more than one basic block (or even just one basic block that
  // cannot be merged into the caller block), then the caller's basic block
  // should be split.
  bool ShouldSplitCallerBB =
    CallInstIndex < CallerBB->size() - 1 &&
    (InlinedFunction.size() > 1 || !CanMergeFirstInlinedBlock);

  // Copy the inlined function's basic blocks into a vector of basic blocks
  // that will be inserted in the caller function (the inlined instance). Also,
  // we keep a mapping from basic block index to the corresponding block in the
  // inlined instance.
  std::vector<std::unique_ptr<BinaryBasicBlock>> InlinedInstance;
  std::unordered_map<const BinaryBasicBlock *, BinaryBasicBlock *> InlinedBBMap;

  for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
    InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
    InlinedBBMap[InlinedFunctionBB] = InlinedInstance.back().get();
    if (InlinedFunction.hasValidProfile()) {
      const auto Count = InlinedFunctionBB->getExecutionCount();
      InlinedInstance.back()->setExecutionCount(Count);
    }
  }
  if (ShouldSplitCallerBB) {
    // Add one extra block at the inlined instance for the removed part of the
    // caller block.
    InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
    if (CallerFunction.hasValidProfile()) {
      const auto Count = CallerBB->getExecutionCount();
      InlinedInstance.back()->setExecutionCount(Count);
    }
  }

  // Copy instructions to the basic blocks of the inlined instance.
  bool First = true;
  for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
    // Get the corresponding block of the inlined instance.
    auto *InlinedInstanceBB = InlinedBBMap.at(InlinedFunctionBB);
    bool IsExitingBlock = false;

    // Copy instructions into the inlined instance.
    for (auto Instruction : *InlinedFunctionBB) {
      if (!IsTailCall &&
          BC.MIA->isReturn(Instruction) &&
          !BC.MIA->isTailCall(Instruction)) {
        // Skip returns when the caller does a normal call as opposed to a tail
        // call.
        IsExitingBlock = true;
        continue;
      }
      if (!IsTailCall &&
          BC.MIA->isTailCall(Instruction)) {
        // Convert tail calls to normal calls when the caller does a normal
        // call.
        if (!BC.MIA->convertTailCallToCall(Instruction))
          assert(false && "unexpected tail call opcode found");
        IsExitingBlock = true;
      }
      if (BC.MIA->isBranch(Instruction) &&
          !BC.MIA->isIndirectBranch(Instruction)) {
        // Convert the branch targets in the branch instructions that will be
        // added to the inlined instance.
        const MCSymbol *OldTargetLabel = nullptr;
        const MCSymbol *OldFTLabel = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel,
                                                  OldFTLabel, CondBranch,
                                                  UncondBranch);
        assert(Result &&
               "analyzeBranch failed on instruction guaranteed to be a branch");
        assert(OldTargetLabel);
        const MCSymbol *NewTargetLabel = nullptr;
        for (const auto SuccBB : InlinedFunctionBB->successors()) {
          if (SuccBB->getLabel() == OldTargetLabel) {
            NewTargetLabel = InlinedBBMap.at(SuccBB)->getLabel();
            break;
          }
        }
        assert(NewTargetLabel);
        BC.MIA->replaceBranchTarget(Instruction, NewTargetLabel, BC.Ctx.get());
      }
      // TODO: Currently we simply ignore CFI instructions, but we need to
      // address them for correctness.
      if (!BC.MIA->isEHLabel(Instruction) &&
          !BC.MIA->isCFI(Instruction)) {
        InlinedInstanceBB->addInstruction(std::move(Instruction));
      }
    }

    // Add CFG edges to the basic blocks of the inlined instance.
    std::vector<BinaryBasicBlock *>
      Successors(InlinedFunctionBB->succ_size(), nullptr);

    std::transform(
        InlinedFunctionBB->succ_begin(),
        InlinedFunctionBB->succ_end(),
        Successors.begin(),
        [&InlinedBBMap](const BinaryBasicBlock *BB) {
          return InlinedBBMap.at(BB);
        });

    if (InlinedFunction.hasValidProfile()) {
      InlinedInstanceBB->addSuccessors(
          Successors.begin(),
          Successors.end(),
          InlinedFunctionBB->branch_info_begin(),
          InlinedFunctionBB->branch_info_end());
    } else {
      InlinedInstanceBB->addSuccessors(
          Successors.begin(),
          Successors.end());
    }

    if (IsExitingBlock) {
      assert(Successors.size() == 0);
      if (ShouldSplitCallerBB) {
        if (InlinedFunction.hasValidProfile()) {
          InlinedInstanceBB->addSuccessor(
              InlinedInstance.back().get(),
              InlinedInstanceBB->getExecutionCount());
        } else {
          InlinedInstanceBB->addSuccessor(InlinedInstance.back().get());
        }
        InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get());
      } else if (!First || !CanMergeFirstInlinedBlock) {
        assert(CallInstIndex == CallerBB->size() - 1);
        assert(CallerBB->succ_size() <= 1);
        if (CallerBB->succ_size() == 1) {
          if (InlinedFunction.hasValidProfile()) {
            InlinedInstanceBB->addSuccessor(
                *CallerBB->succ_begin(),
                InlinedInstanceBB->getExecutionCount());
          } else {
            InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin());
          }
          InlinedInstanceBB->addBranchInstruction(*CallerBB->succ_begin());
        }
      }
    }

    First = false;
  }

  if (ShouldSplitCallerBB) {
    // Split the basic block that contains the call and add the removed
    // instructions in the last block of the inlined instance.
    // (Is it OK to have a basic block with just CFI instructions?)
    std::vector<MCInst> TrailInstructions =
      CallerBB->splitInstructions(&CallInst);
    assert(TrailInstructions.size() > 0);
    InlinedInstance.back()->addInstructions(
        TrailInstructions.begin(),
        TrailInstructions.end());
    // Add CFG edges for the block with the removed instructions.
    if (CallerFunction.hasValidProfile()) {
      InlinedInstance.back()->addSuccessors(
          CallerBB->succ_begin(),
          CallerBB->succ_end(),
          CallerBB->branch_info_begin(),
          CallerBB->branch_info_end());
    } else {
      InlinedInstance.back()->addSuccessors(
          CallerBB->succ_begin(),
          CallerBB->succ_end());
    }
    // Update the after-inlined point.
    AfterInlinedBB = InlinedInstance.back().get();
    AfterInlinedIstrIndex = 0;
  }

  assert(InlinedInstance.size() > 0 && "found function with no basic blocks");
  assert(InlinedInstance.front()->size() > 0 &&
         "found function with empty basic block");

  // If the inlining cannot happen as a simple instruction insertion into
  // CallerBB, we remove the outgoing CFG edges of the caller block.
  if (InlinedInstance.size() > 1 || !CanMergeFirstInlinedBlock) {
    CallerBB->removeSuccessors(CallerBB->succ_begin(), CallerBB->succ_end());
    if (!ShouldSplitCallerBB) {
      // Update the after-inlined point.
      AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
      AfterInlinedIstrIndex = 0;
    }
  } else {
    assert(!ShouldSplitCallerBB);
    // Update the after-inlined point.
    if (CallInstIndex < CallerBB->size() - 1) {
      AfterInlinedBB = CallerBB;
      AfterInlinedIstrIndex =
        CallInstIndex + InlinedInstance.front()->size();
    } else {
      AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
      AfterInlinedIstrIndex = 0;
    }
  }

  // Do the inlining by merging the first block of the inlined instance into
  // the caller basic block if possible and adding the rest of the inlined
  // instance basic blocks in the caller function.
  if (CanMergeFirstInlinedBlock) {
    CallerBB->replaceInstruction(
        &CallInst,
        InlinedInstance.front()->begin(),
        InlinedInstance.front()->end());
    if (InlinedInstance.size() > 1) {
      auto FirstBB = InlinedInstance.begin()->get();
      if (InlinedFunction.hasValidProfile()) {
        CallerBB->addSuccessors(
            FirstBB->succ_begin(),
            FirstBB->succ_end(),
            FirstBB->branch_info_begin(),
            FirstBB->branch_info_end());
      } else {
        CallerBB->addSuccessors(
            FirstBB->succ_begin(),
            FirstBB->succ_end());
      }
      FirstBB->removeSuccessors(FirstBB->succ_begin(), FirstBB->succ_end());
    }
    InlinedInstance.erase(InlinedInstance.begin());
  } else {
    CallerBB->eraseInstruction(&CallInst);
    if (CallerFunction.hasValidProfile()) {
      CallerBB->addSuccessor(InlinedInstance.front().get(),
                             CallerBB->getExecutionCount());
    } else {
      CallerBB->addSuccessor(InlinedInstance.front().get(),
                             CallerBB->getExecutionCount());
    }
  }
  CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance));

  return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex);
}

bool InlineSmallFunctions::inlineCallsInFunction(
    BinaryContext &BC,
    BinaryFunction &Function) {
  std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
                                         Function.layout().end());
  std::sort(Blocks.begin(), Blocks.end(),
            [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
              return BB1->getExecutionCount() > BB2->getExecutionCount();
            });
  uint32_t ExtraSize = 0;

  for (auto BB : Blocks) {
    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst)) {
        TotalDynamicCalls += BB->getExecutionCount();
      }
    }
  }

  bool DidInlining = false;

  for (auto BB : Blocks) {
    if (BB->isCold())
      continue;

    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst) &&
          !BC.MIA->isTailCall(Inst) &&
          Inst.size() == 1 &&
          Inst.getOperand(0).isExpr()) {
        const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        assert(TargetSymbol && "target symbol expected for direct call");
        const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
        if (TargetFunction) {
          bool CallToInlineableFunction =
            InliningCandidates.count(TargetFunction);

          TotalInlineableCalls +=
            CallToInlineableFunction * BB->getExecutionCount();

          if (CallToInlineableFunction &&
              TargetFunction->getSize() + ExtraSize
              + Function.estimateHotSize() < Function.getMaxSize()) {
            auto NextInstIt = std::next(InstIt);
            inlineCall(BC, *BB, &Inst, *TargetFunction->begin());
            DidInlining = true;
            DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
                         << *TargetFunction << " in "
                         << Function << "\n");
            InstIt = NextInstIt;
            ExtraSize += TargetFunction->getSize();
            InlinedDynamicCalls += BB->getExecutionCount();
            continue;
          }
        }
      }

      ++InstIt;
    }
  }

  return DidInlining;
}

bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
    BinaryContext &BC,
    BinaryFunction &Function) {
  std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
                                         Function.layout().end());
  std::sort(Blocks.begin(), Blocks.end(),
            [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
              return BB1->getExecutionCount() > BB2->getExecutionCount();
            });
  uint32_t ExtraSize = 0;

  for (auto BB : Blocks) {
    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst)) {
        TotalDynamicCalls += BB->getExecutionCount();
      }
    }
  }

  bool DidInlining = false;

  for (auto BB : Blocks) {
    if (BB->isCold())
      continue;

    unsigned InstIndex = 0;
    for (auto InstIt = BB->begin(); InstIt != BB->end(); ) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst) &&
          Inst.size() == 1 &&
          Inst.getOperand(0).isExpr()) {
        assert(!BC.MIA->isInvoke(Inst));
        const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        assert(TargetSymbol && "target symbol expected for direct call");
        const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
        if (TargetFunction) {
          bool CallToInlineableFunction =
            InliningCandidates.count(TargetFunction);

          TotalInlineableCalls +=
            CallToInlineableFunction * BB->getExecutionCount();

          if (CallToInlineableFunction &&
              TargetFunction->getSize() + ExtraSize
              + Function.estimateHotSize() < Function.getMaxSize()) {
            unsigned NextInstIndex = 0;
            BinaryBasicBlock *NextBB = nullptr;
            std::tie(NextBB, NextInstIndex) =
              inlineCall(BC, Function, BB, InstIndex, *TargetFunction);
            DidInlining = true;
            DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
                         << *TargetFunction << " in "
                         << Function << "\n");
            InstIndex = NextBB == BB ? NextInstIndex : BB->size();
            InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end();
            ExtraSize += TargetFunction->getSize();
            InlinedDynamicCalls += BB->getExecutionCount();
            continue;
          }
        }
      }

      ++InstIndex;
      ++InstIt;
    }
  }

  return DidInlining;
}

bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) {
  for (auto &Name : opts::ForceInlineFunctions) {
    if (BF.hasName(Name))
      return true;
  }
  return false;
}

void InlineSmallFunctions::runOnFunctions(
    BinaryContext &BC,
    std::map<uint64_t, BinaryFunction> &BFs,
    std::set<uint64_t> &) {

  if (opts::AggressiveInlining)
    findInliningCandidatesAggressive(BC, BFs);
  else
    findInliningCandidates(BC, BFs);

  std::vector<BinaryFunction *> ConsideredFunctions;
  for (auto &It : BFs) {
    auto &Function = It.second;
    if (!shouldOptimize(Function) ||
        (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE &&
         !mustConsider(Function)))
      continue;
    ConsideredFunctions.push_back(&Function);
  }
  std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(),
            [](BinaryFunction *A, BinaryFunction *B) {
              return B->getExecutionCount() < A->getExecutionCount();
            });
  unsigned ModifiedFunctions = 0;
  for (unsigned i = 0; i < ConsideredFunctions.size() &&
       ModifiedFunctions <= kMaxFunctions; ++i) {
    auto &Function = *ConsideredFunctions[i];

    const bool DidInline = opts::AggressiveInlining
      ? inlineCallsInFunctionAggressive(BC, Function)
      : inlineCallsInFunction(BC, Function);

    if (DidInline) {
      Modified.insert(&Function);
      ++ModifiedFunctions;
    }
  }

  DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of "
               << TotalDynamicCalls << " function calls in the profile.\n"
               << "BOLT-INFO: Inlined calls represent "
               << format("%.1f",
                         100.0 * InlinedDynamicCalls / TotalInlineableCalls)
               << "% of all inlineable calls in the profile.\n");
}

} // namespace bolt
} // namespace llvm
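For reference, the two hidden options registered at the top of this file would be driven from the command line roughly as follows (a hypothetical invocation; the binary and output names are placeholders):

    llvm-bolt a.out -o a.bolt.out -force-inline=func1,func2 -aggressive-inlining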
@@ -0,0 +1,102 @@
//===--- Passes/Inliner.h - Inlining infra for BOLT -----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H

#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryPasses.h"

namespace llvm {
namespace bolt {

/// Inlining of single basic block functions.
/// The pass currently does not handle CFI instructions; handling them is
/// needed for correctness, so we may break exception handling because of this.
class InlineSmallFunctions : public BinaryFunctionPass {
private:
  std::set<const BinaryFunction *> InliningCandidates;

  /// Maximum number of instructions in an inlined function.
  static const unsigned kMaxInstructions = 8;
  /// Maximum code size (in bytes) of inlined function (used by aggressive
  /// inlining).
  static const uint64_t kMaxSize = 60;
  /// Maximum number of functions that will be considered for inlining (in
  /// descending hotness order).
  static const unsigned kMaxFunctions = 30000;

  /// Statistics collected for debugging.
  uint64_t TotalDynamicCalls = 0;
  uint64_t InlinedDynamicCalls = 0;
  uint64_t TotalInlineableCalls = 0;
  std::unordered_set<const BinaryFunction *> Modified;

  static bool mustConsider(const BinaryFunction &BF);

  void findInliningCandidates(BinaryContext &BC,
                              const std::map<uint64_t, BinaryFunction> &BFs);

  /// Inline the call in CallInst to InlinedFunctionBB (the only BB of the
  /// called function).
  void inlineCall(BinaryContext &BC,
                  BinaryBasicBlock &BB,
                  MCInst *CallInst,
                  const BinaryBasicBlock &InlinedFunctionBB);

  bool inlineCallsInFunction(BinaryContext &BC,
                             BinaryFunction &Function);

  /// The following methods do a more aggressive inlining pass, where we
  /// inline calls as well as tail calls and we are not limited to inlining
  /// functions with only one basic block.
  /// FIXME: Currently these are broken since they do not work with the split
  /// function option.
  void findInliningCandidatesAggressive(
      BinaryContext &BC, const std::map<uint64_t, BinaryFunction> &BFs);

  bool inlineCallsInFunctionAggressive(
      BinaryContext &BC, BinaryFunction &Function);

  /// Inline the call in CallInst to InlinedFunction. The inlined function
  /// should not contain any landing pads or throwing edges but may have more
  /// than one basic block.
  ///
  /// Return the location (basic block and instruction index) where the code of
  /// the caller function continues after the inlined code.
  std::pair<BinaryBasicBlock *, unsigned>
  inlineCall(BinaryContext &BC,
             BinaryFunction &CallerFunction,
             BinaryBasicBlock *CallerBB,
             const unsigned CallInstIndex,
             const BinaryFunction &InlinedFunction);

public:
  explicit InlineSmallFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "inlining";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

#endif
@@ -0,0 +1,698 @@
//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implements different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//

#include "ReorderAlgorithm.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <queue>
#include <functional>

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

static cl::opt<bool>
PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore);

static cl::opt<uint32_t>
RandomSeed("bolt-seed",
           cl::desc("seed for randomization"),
           cl::init(42),
           cl::ZeroOrMore);

} // namespace opts

namespace {

template <class T>
inline void hashCombine(size_t &Seed, const T &Val) {
  std::hash<T> Hasher;
  Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

template <typename A, typename B>
struct HashPair {
  size_t operator()(const std::pair<A,B>& Val) const {
    std::hash<A> Hasher;
    size_t Seed = Hasher(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

}
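// Editor's note: hashCombine() above is the familiar Boost-style
// hash_combine, and HashPair builds on it so that a std::pair can serve as a
// key in unordered containers; GreedyClusterAlgorithm::EdgeHash below uses it
// to hash (Src, Dst) basic block pairs.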
||||
void ClusterAlgorithm::computeClusterAverageFrequency() {
|
||||
AvgFreq.resize(Clusters.size(), 0.0);
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
double Freq = 0.0;
|
||||
for (auto BB : Clusters[I]) {
|
||||
if (BB->getNumNonPseudos() > 0)
|
||||
Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos();
|
||||
}
|
||||
AvgFreq[I] = Freq;
|
||||
}
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::printClusters() const {
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
errs() << "Cluster number " << I;
|
||||
if (AvgFreq.size() == Clusters.size())
|
||||
errs() << " (frequency: " << AvgFreq[I] << ")";
|
||||
errs() << " : ";
|
||||
auto Sep = "";
|
||||
for (auto BB : Clusters[I]) {
|
||||
errs() << Sep << BB->getName();
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::reset() {
|
||||
Clusters.clear();
|
||||
ClusterEdges.clear();
|
||||
AvgFreq.clear();
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
|
||||
OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
|
||||
}
|
||||
|
||||
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
|
||||
HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
|
||||
return Hasher(std::make_pair(E.Src, E.Dst));
|
||||
}
|
||||
|
||||
bool GreedyClusterAlgorithm::EdgeEqual::operator()(
|
||||
const EdgeTy &A, const EdgeTy &B) const {
|
||||
return A.Src == B.Src && A.Dst == B.Dst;
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
|
||||
bool ComputeEdges) {
|
||||
reset();
|
||||
|
||||
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
|
||||
// maximize weight during a path traversing all BBs. In this way, we will
|
||||
// convert the hottest branches into fall-throughs.
|
||||
|
||||
// This is the queue of edges from which we will pop edges and use them to
|
||||
// cluster basic blocks in a greedy fashion.
|
||||
std::vector<EdgeTy> Queue;
|
||||
|
||||
// Initialize inter-cluster weights.
|
||||
if (ComputeEdges)
|
||||
ClusterEdges.resize(BF.layout_size());
|
||||
|
||||
// Initialize clusters and edge queue.
|
||||
for (auto BB : BF.layout()) {
|
||||
// Create a cluster for this BB.
|
||||
uint32_t I = Clusters.size();
|
||||
Clusters.emplace_back();
|
||||
auto &Cluster = Clusters.back();
|
||||
Cluster.push_back(BB);
|
||||
BBToClusterMap[BB] = I;
|
||||
// Populate priority queue with edges.
|
||||
auto BI = BB->branch_info_begin();
|
||||
for (auto &I : BB->successors()) {
|
||||
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
|
||||
"attempted reordering blocks of function with no profile data");
|
||||
Queue.emplace_back(EdgeTy(BB, I, BI->Count));
|
||||
++BI;
|
||||
}
|
||||
}
|
||||
// Sort and adjust the edge queue.
|
||||
initQueue(Queue, BF);
|
||||
|
||||
// Grow clusters in a greedy fashion.
|
||||
while (!Queue.empty()) {
|
||||
auto E = Queue.back();
|
||||
Queue.pop_back();
|
||||
|
||||
const auto *SrcBB = E.Src;
|
||||
const auto *DstBB = E.Dst;
|
||||
|
||||
DEBUG(dbgs() << "Popped edge ";
|
||||
E.print(dbgs());
|
||||
dbgs() << "\n");
|
||||
|
||||
// Case 1: BBSrc and BBDst are the same. Ignore this edge
|
||||
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
|
||||
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
int I = BBToClusterMap[SrcBB];
|
||||
int J = BBToClusterMap[DstBB];
|
||||
|
||||
// Case 2: If they are already allocated at the same cluster, just increase
|
||||
// the weight of this cluster
|
||||
if (I == J) {
|
||||
if (ComputeEdges)
|
||||
ClusterEdges[I][I] += E.Count;
|
||||
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
auto &ClusterA = Clusters[I];
|
||||
auto &ClusterB = Clusters[J];
|
||||
if (areClustersCompatible(ClusterA, ClusterB, E)) {
|
||||
// Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
|
||||
// allowing us to merge two clusters.
|
||||
for (auto BB : ClusterB)
|
||||
BBToClusterMap[BB] = I;
|
||||
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
|
||||
ClusterB.clear();
|
||||
if (ComputeEdges) {
|
||||
// Increase the intra-cluster edge count of cluster A with the count of
|
||||
// this edge as well as with the total count of previously visited edges
|
||||
// from cluster B cluster A.
|
||||
ClusterEdges[I][I] += E.Count;
|
||||
ClusterEdges[I][I] += ClusterEdges[J][I];
|
||||
// Iterate through all inter-cluster edges and transfer edges targeting
|
||||
// cluster B to cluster A.
|
||||
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
|
||||
ClusterEdges[K][I] += ClusterEdges[K][J];
|
||||
}
|
||||
// Adjust the weights of the remaining edges and re-sort the queue.
|
||||
adjustQueue(Queue, BF);
|
||||
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
|
||||
} else {
|
||||
// Case 4: Both SrcBB and DstBB are allocated in positions we cannot
|
||||
// merge them. Add the count of this edge to the inter-cluster edge count
|
||||
// between clusters A and B to help us decide ordering between these
|
||||
// clusters.
|
||||
if (ComputeEdges)
|
||||
ClusterEdges[I][J] += E.Count;
|
||||
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::reset() {
|
||||
ClusterAlgorithm::reset();
|
||||
BBToClusterMap.clear();
|
||||
}
|
||||
|
||||
void PHGreedyClusterAlgorithm::initQueue(
|
||||
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
|
||||
// Define a comparison function to establish SWO between edges.
|
||||
auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) {
|
||||
// With equal weights, prioritize branches with lower index
|
||||
// source/destination. This helps to keep original block order for blocks
|
||||
// when optimal order cannot be deducted from a profile.
|
||||
if (A.Count == B.Count) {
|
||||
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
|
||||
return (SrcOrder != 0)
|
||||
? SrcOrder > 0
|
||||
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
|
||||
}
|
||||
return A.Count < B.Count;
|
||||
};
|
||||
|
||||
// Sort edges in increasing profile count order.
|
||||
std::sort(Queue.begin(), Queue.end(), Comp);
|
||||
}

void PHGreedyClusterAlgorithm::adjustQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Nothing to do.
  return;
}

bool PHGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with the same source and destination, edges that target
    // the entry block, as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with the same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}
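
// Illustrative sketch (not part of this pass): the arithmetic above in
// isolation. For an edge A->B with count 500, another successor edge A->C
// with count 100, and another predecessor edge D->B with count 150, the
// weight is 500 - 100 - 150 = 250 branches saved if A->B becomes a
// fall-through. All names here are hypothetical.
namespace {
int64_t fallThroughWin(int64_t EdgeCount, int64_t OtherSrcSuccCounts,
                       int64_t OtherDstPredCounts) {
  // Laying the edge out as a fall-through saves EdgeCount branches, but every
  // other edge leaving the source or entering the destination must remain an
  // explicit branch.
  return EdgeCount - OtherSrcSuccCounts - OtherDstPredCounts;
}
} // anonymous namespace
// e.g. fallThroughWin(500, 100, 150) == 250, matching the example in
// Passes/ReorderAlgorithm.h.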

void MinBranchGreedyClusterAlgorithm::initQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Initialize edge weights.
  for (const EdgeTy &E : Queue)
    Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

  // Sort edges in increasing weight order.
  adjustQueue(Queue, BF);
}

void MinBranchGreedyClusterAlgorithm::adjustQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Define a comparison function to establish a strict weak ordering (SWO)
  // between edges.
  auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep the original block order for
    // blocks when the optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
        ? SrcOrder > 0
        : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const auto *SrcBB = E.Src;
    const auto *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block.
    // Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
            E.print(dbgs());
            dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    auto &ClusterA = Clusters[I];
    auto &ClusterB = Clusters[J];

    // Case 2: SrcBB and DstBB are already allocated to the same cluster or to
    // incompatible clusters. Adjust the weights of edges with the same source
    // or destination, so that this edge no longer has any effect on them, and
    // ignore this edge. Also increase the intra- (or inter-) cluster edge
    // count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
            E.print(dbgs());
            dbgs() << " (src, dst belong to same cluster or incompatible "
                      "clusters)\n");
      for (const auto *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const auto *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  std::sort(Queue.begin(), Queue.end(), Comp);
}

bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

void MinBranchGreedyClusterAlgorithm::reset() {
  GreedyClusterAlgorithm::reset();
  Weight.clear();
}

void OptimalReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::unordered_map<const BinaryBasicBlock *, int> BBToIndex;
  std::vector<BinaryBasicBlock *> IndexToBB;

  unsigned N = BF.layout_size();
  // Populate the weight map and the index maps.
  for (auto BB : BF.layout()) {
    BBToIndex[BB] = IndexToBB.size();
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (auto BB : BF.layout()) {
    auto BI = BB->branch_info_begin();
    Weight[BBToIndex[BB]].resize(N);
    for (auto I : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
      ++BI;
    }
  }

  std::vector<std::vector<int64_t>> DP;
  DP.resize(1 << N);
  for (auto &Elmt : DP) {
    Elmt.resize(N, -1);
  }
  // Start with the entry basic block being allocated with cost zero.
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (the
  // current set of BBs in the layout).
  unsigned BestSet = 1;
  unsigned BestLast = 0;
  int64_t BestWeight = 0;
  for (unsigned Set = 1; Set < (1U << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout.
    for (unsigned Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last.
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New.
      for (unsigned New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set.
        if ((Set & (1 << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this set, so we add it to this Set and
        // record the total weight of this layout with "New" as the last BB.
        unsigned NewSet = (Set | (1 << New));
        if (DP[NewSet][New] == -1)
          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Define the final function layout based on the layout that maximizes
  // weight.
  unsigned Last = BestLast;
  unsigned Set = BestSet;
  std::vector<bool> Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1U << Last);
  while (Set != 0) {
    int64_t Best = -1;
    for (unsigned I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      if (DP[Set][I] > Best) {
        Last = I;
        Best = DP[Set][I];
      }
    }
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1U << Last);
  }
  std::reverse(Order.begin(), Order.end());

  // Finalize the layout with BBs that weren't assigned to it.
  for (auto BB : BF.layout()) {
    if (!Visited[BBToIndex[BB]])
      Order.push_back(BB);
  }
}
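
// Note (sketch, not BOLT API): the DP above fills 2^N * N table cells and
// scans N candidates per cell, i.e. O(2^N * N^2) time and O(2^N * N) memory,
// so it is only practical for functions with a handful of blocks. A
// caller-side guard might look like this; the threshold is illustrative:
namespace {
bool fitsOptimalReorder(unsigned NumBlocks) {
  return NumBlocks <= 10; // 2^10 * 10 int64_t DP cells is roughly 80 KB
}
} // anonymous namespace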

void OptimizeReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Arrange basic blocks according to clusters.
  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}

void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  auto &ClusterEdges = CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency();
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight.
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)> SuccQueue(ClusterComp);
      for (auto &Target: ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node.
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence.
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point.
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
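
// Illustrative sketch (not part of this pass): the traversal above reduced to
// a plain adjacency map. Successors are pushed coldest-first so the hottest
// one sits on top of the stack and is visited first; nodes are emitted in
// reverse finish order, giving a topological-style order biased toward hot
// paths. Node 0 is assumed to be the entry; all names are hypothetical.
namespace {
std::vector<unsigned>
hotTopoOrder(const std::vector<std::map<unsigned, uint64_t>> &Succ) {
  std::vector<unsigned> Order;
  std::vector<char> Stacked(Succ.size(), 0), Visited(Succ.size(), 0);
  std::stack<unsigned> Stack;
  Stack.push(0);
  Stacked[0] = 1;
  while (!Stack.empty()) {
    unsigned I = Stack.top();
    if (!Visited[I]) {
      Visited[I] = 1;
      // Sort successors by ascending weight, then push in that order so the
      // hottest successor ends up on top of the stack.
      std::vector<std::pair<unsigned, uint64_t>> S(Succ[I].begin(),
                                                   Succ[I].end());
      std::sort(S.begin(), S.end(),
                [](const std::pair<unsigned, uint64_t> &A,
                   const std::pair<unsigned, uint64_t> &B) {
                  return A.second < B.second;
                });
      for (const auto &T : S) {
        if (T.second > 0 && !Stacked[T.first]) {
          Stacked[T.first] = 1;
          Stack.push(T.first);
        }
      }
      continue;
    }
    Stack.pop();
    Order.push_back(I);
  }
  std::reverse(Order.begin(), Order.end());
  return Order;
}
} // anonymous namespace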

void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency();
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point.
  std::stable_sort(std::next(ClusterOrder.begin()),
                   ClusterOrder.end(),
                   [&AvgFreq](uint32_t A, uint32_t B) {
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}

void ReverseReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  auto FirstBB = *BF.layout_begin();
  Order.push_back(FirstBB);
  for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
    Order.push_back(*RLI);
}


void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Collect the non-empty clusters.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  std::srand(opts::RandomSeed);
  std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end());

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
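
// Note (sketch, not part of the commit): std::random_shuffle, used above, was
// deprecated in C++14 and removed in C++17. An equivalent helper with an
// explicit, seedable engine:
namespace {
template <typename RandomIt>
void seededShuffle(RandomIt First, RandomIt Last, unsigned Seed) {
  std::mt19937 Gen(Seed);         // deterministic for a fixed seed
  std::shuffle(First, Last, Gen); // unbiased Fisher-Yates shuffle
}
} // anonymous namespace
// Usage would mirror the call above (requires <random>), e.g.
//   seededShuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
//                 opts::RandomSeed);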
@ -0,0 +1,268 @@
//===--- Passes/ReorderAlgorithm.h - Basic block reordering algorithms ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Interface to different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H

#include "BinaryFunction.h"
#include "llvm/Support/ErrorHandling.h"
#include <memory>
#include <unordered_map>
#include <vector>


namespace llvm {

class raw_ostream;


namespace bolt {

class BinaryBasicBlock;
class BinaryFunction;

/// Objects of this class implement various basic block clustering algorithms.
/// Basic block clusters are chains of basic blocks that should be laid out
/// in this order to maximize performance. These algorithms group basic blocks
/// into clusters using execution profile data and various heuristics.
class ClusterAlgorithm {
public:
  using ClusterTy = std::vector<BinaryBasicBlock *>;
  std::vector<ClusterTy> Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> ClusterEdges;
  std::vector<double> AvgFreq;

  /// Group the basic blocks in the given function into clusters stored in the
  /// Clusters vector. Also encode relative weights between two clusters in
  /// the ClusterEdges vector if requested. This vector is indexed by
  /// the cluster indices in the Clusters vector.
  virtual void clusterBasicBlocks(const BinaryFunction &BF,
                                  bool ComputeEdges = false) = 0;

  /// Compute for each cluster its average execution frequency, that is
  /// the sum of the average frequencies of its blocks (execution count /
  /// # instrs). The average frequencies are stored in the AvgFreq vector,
  /// indexed by the cluster indices in the Clusters vector.
  void computeClusterAverageFrequency();

  /// Clear clusters and related info.
  virtual void reset();

  void printClusters() const;

  virtual ~ClusterAlgorithm() {}
};
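
/// Illustrative sketch (not the BOLT implementation): the formula documented
/// for computeClusterAverageFrequency() on plain numbers. Each block
/// contributes its execution count divided by its instruction count; the pair
/// layout here is hypothetical.
namespace sketch {
inline double clusterAverageFrequency(
    const std::vector<std::pair<uint64_t, uint64_t>> &Blocks) {
  double Freq = 0.0;
  for (const auto &B : Blocks)  // B.first = exec count, B.second = # instrs
    if (B.second != 0)
      Freq += double(B.first) / double(B.second);
  return Freq;
}
} // namespace sketch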

/// Base class for a greedy clustering algorithm that selects edges in order
/// based on some heuristic and uses them to join basic blocks into clusters.
class GreedyClusterAlgorithm : public ClusterAlgorithm {
protected:
  // Represents an edge between two basic blocks, with source, destination,
  // and profile count.
  struct EdgeTy {
    const BinaryBasicBlock *Src;
    const BinaryBasicBlock *Dst;
    uint64_t Count;

    EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst,
           uint64_t Count) :
      Src(Src), Dst(Dst), Count(Count) {}

    void print(raw_ostream &OS) const;
  };

  struct EdgeHash {
    size_t operator() (const EdgeTy &E) const;
  };

  struct EdgeEqual {
    bool operator() (const EdgeTy &A, const EdgeTy &B) const;
  };

  // Virtual methods that allow custom specialization of the heuristic used by
  // the algorithm to select edges.
  virtual void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
  virtual void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
  virtual bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0;

  // Map from basic block to owning cluster index.
  using BBToClusterMapTy = std::unordered_map<const BinaryBasicBlock *,
                                              unsigned>;
  BBToClusterMapTy BBToClusterMap;

public:
  void clusterBasicBlocks(const BinaryFunction &BF,
                          bool ComputeEdges = false) override;
  void reset() override;
};


/// This clustering algorithm is based on a greedy heuristic suggested by
/// Pettis and Hansen (PLDI '90).
class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
protected:
  void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
    override;
};


/// This clustering algorithm is based on a greedy heuristic that is a
/// modification of the heuristic suggested by Pettis (PLDI '90). It is
/// geared towards minimizing branches.
class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
private:
  // Map from an edge to its weight, which is used by the algorithm to sort
  // the edges.
  std::unordered_map<EdgeTy, int64_t, EdgeHash, EdgeEqual> Weight;

  // The weight of an edge is calculated as the win in branches if we choose
  // to lay out this edge as a fall-through. For example, consider the edges
  //  A -> B with execution count 500,
  //  A -> C with execution count 100, and
  //  D -> B with execution count 150,
  // where B, C are the only successors of A and A, D are the only
  // predecessors of B. Then if we choose to lay out edge A -> B as a
  // fall-through, the win in branches is 500 - 100 - 150 = 250. That is the
  // weight of edge A -> B.
  int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const;

protected:
  void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
    override;

public:
  void reset() override;
};


/// Objects of this class implement various basic block reordering algorithms.
/// Most of these algorithms depend on a clustering algorithm.
/// Here we have three conflicting goals as to how to lay out clusters. If we
/// want to minimize jump offsets, we should put clusters with heavy
/// inter-cluster dependence as close as possible. If we want to maximize the
/// probability that all inter-cluster edges are predicted as not-taken, we
/// should enforce a topological order to make targets appear after sources,
/// creating forward branches. If we want to separate hot from cold blocks to
/// maximize the probability that infrequently executed code doesn't pollute
/// the cache, we should put clusters in descending order of hotness.
class ReorderAlgorithm {
protected:
  std::unique_ptr<ClusterAlgorithm> CAlgo;

public:
  ReorderAlgorithm() { }
  explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
    CAlgo(std::move(CAlgo)) { }

  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  /// Reorder the basic blocks of the given function and store the new order
  /// in the Order vector.
  virtual void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const = 0;

  void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
    this->CAlgo.reset(CAlgo);
  }

  virtual ~ReorderAlgorithm() { }
};


/// Dynamic programming implementation for the TSP, applied to BB layout. Find
/// the optimal way to maximize weight during a path traversing all BBs. In
/// this way, we will convert the hottest branches into fall-throughs.
///
/// Uses an exponential amount of memory in the number of basic blocks and
/// should only be used for small functions.
class OptimalReorderAlgorithm : public ReorderAlgorithm {
public:
  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// Simple algorithm that groups basic blocks into clusters and then
/// lays them out cluster after cluster.
class OptimizeReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// This reorder algorithm tries to ensure that all inter-cluster edges are
/// predicted as not-taken, by enforcing a topological order to make
/// targets appear after sources, creating forward branches.
class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeBranchReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// This reorder algorithm tries to separate hot from cold blocks to maximize
/// the probability that infrequently executed code doesn't pollute the cache,
/// by putting clusters in descending order of hotness.
class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeCacheReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// Toy example that simply reverses the original basic block order.
class ReverseReorderAlgorithm : public ReorderAlgorithm {
public:
  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};

/// Create clusters as usual and place them in random order.
class RandomClusterReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit RandomClusterReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
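
/// Usage sketch (hypothetical helper, not part of this header): how a caller
/// composes a reordering algorithm with a clustering algorithm. Any concrete
/// pair from the classes above can be substituted.
namespace sketch {
inline ReorderAlgorithm::BasicBlockOrder
reorderWithCacheHeuristic(const BinaryFunction &BF) {
  OptimizeCacheReorderAlgorithm RA(
      std::unique_ptr<ClusterAlgorithm>(new PHGreedyClusterAlgorithm()));
  ReorderAlgorithm::BasicBlockOrder Order;
  RA.reorderBasicBlocks(BF, Order); // clusters, sorts by hotness, flattens
  return Order;
}
} // namespace sketch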

} // namespace bolt
} // namespace llvm

#endif