HFSort/call graph refactoring

Summary:
I've factored the call graph code out of the dataflow and function reordering code and done a few small renames/cleanups. I've also moved the function reordering pass into a separate file because it was starting to get big.

I've got more refactoring planned for hfsort/call graph, but this is a start.
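
For orientation, the refactored pieces are meant to compose roughly as in the sketch below (illustrative glue code, not part of this commit; the include paths and the helper name are assumptions):

#include "BinaryFunction.h"
#include "CallGraph.h"   // new: shared call graph + buildCallGraph()
#include "HFSort.h"      // clusterize(), hfsortPlus(), pettisAndHansen()

using namespace llvm::bolt;

void layoutHotFunctions(BinaryContext &BC,
                        std::map<uint64_t, BinaryFunction> &BFs) {
  // One shared builder for every consumer of the call graph.
  CallGraph Cg = buildCallGraph(BC, BFs);

  // All layout algorithms now take the same graph type.
  std::vector<Cluster> Clusters = clusterize(Cg);

  // Assign output indices to hot functions in cluster order.
  uint32_t Index = 0;
  for (const Cluster &C : Clusters)
    for (CallGraph::NodeId Id : C.Targets)
      Cg.Funcs[Id]->setIndex(Index++);
}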

(cherry picked from FBD5140771)
Bill Nell 2017-05-26 12:53:21 -07:00 committed by Maksim Panchenko
parent 9b190cc74b
commit 733e8c464f
16 changed files with 1139 additions and 1072 deletions

View File

@ -13,6 +13,7 @@
#include "Passes/FrameOptimizer.h"
#include "Passes/IndirectCallPromotion.h"
#include "Passes/Inliner.h"
#include "Passes/ReorderFunctions.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include <numeric>

View File

@ -10,11 +10,8 @@
//===----------------------------------------------------------------------===//
#include "BinaryPasses.h"
#include "HFSort.h"
#include "llvm/Support/Options.h"
#include <fstream>
#define DEBUG_TYPE "bolt"
using namespace llvm;
@ -52,11 +49,9 @@ namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<uint32_t> RandomSeed;
extern cl::opt<bool> Relocs;
extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern size_t padFunction(const bolt::BinaryFunction &Function);
enum DynoStatsSortOrder : char {
Ascending,
@ -71,18 +66,6 @@ DynoStatsSortOrderOpt("print-sorted-by-order",
cl::init(DynoStatsSortOrder::Descending),
cl::cat(BoltOptCategory));
static cl::opt<std::string>
FunctionOrderFile("function-order",
cl::desc("file containing an ordered list of functions to use for function "
"reordering"),
cl::cat(BoltOptCategory));
static cl::opt<std::string>
GenerateFunctionOrderFile("generate-function-order",
cl::desc("file to dump the ordered list of functions to use for function "
"reordering"),
cl::cat(BoltOptCategory));
static cl::opt<bool>
ICFUseDFS("icf-dfs",
cl::desc("use DFS ordering when using -icf option"),
@ -143,41 +126,6 @@ ReorderBlocks("reorder-blocks",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
cl::opt<bolt::BinaryFunction::ReorderType>
ReorderFunctions("reorder-functions",
cl::desc("reorder and cluster functions (works only with relocations)"),
cl::init(bolt::BinaryFunction::RT_NONE),
cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE,
"none",
"do not reorder functions"),
clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT,
"exec-count",
"order by execution count"),
clEnumValN(bolt::BinaryFunction::RT_HFSORT,
"hfsort",
"use hfsort algorithm"),
clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS,
"hfsort+",
"use hfsort+ algorithm"),
clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN,
"pettis-hansen",
"use Pettis-Hansen algorithm"),
clEnumValN(bolt::BinaryFunction::RT_RANDOM,
"random",
"reorder functions randomly"),
clEnumValN(bolt::BinaryFunction::RT_USER,
"user",
"use function order specified by -function-order"),
clEnumValEnd),
cl::cat(BoltOptCategory));
static cl::opt<bool>
ReorderFunctionsUseHotSize("reorder-functions-use-hot-size",
cl::desc("use a function's hot size when doing clustering"),
cl::init(true),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
enum SctcModes : char {
SctcAlways,
SctcPreserveDirection,
@ -200,13 +148,6 @@ SctcMode("sctc-mode",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
UseEdgeCounts("use-edge-counts",
cl::desc("use edge count data when doing clustering"),
cl::init(true),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
@ -1177,424 +1118,5 @@ void StripRepRet::runOnFunctions(
}
}
void ReorderFunctions::buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
// Add call graph nodes.
auto lookupNode = [&](BinaryFunction *Function) {
auto It = FuncToTargetId.find(Function);
if (It == FuncToTargetId.end()) {
// It's ok to use the hot size here when the function is split. This is
// because emitFunctions will emit the hot part first in the order that is
// computed by ReorderFunctions. The cold part will be emitted with the
// rest of the cold functions and code.
const auto Size = opts::ReorderFunctionsUseHotSize && Function->isSplit()
? Function->estimateHotSize()
: Function->estimateSize();
const auto Id = Cg.addTarget(Size);
assert(size_t(Id) == Funcs.size());
Funcs.push_back(Function);
FuncToTargetId[Function] = Id;
// NOTE: for functions without a profile, we set the number of samples
// to zero. This will keep these functions from appearing in the hot
// section. This is a little weird because we wouldn't be trying to
// create a node for a function unless it was the target of a call from
// a hot block. The alternative would be to set the count to one or
// accumulate the number of calls from the callsite into the function
// samples. Results from performance testing seem to favor the zero
// count though, so I'm leaving it this way for now.
Cg.Targets[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0;
assert(Funcs[Id] == Function);
return Id;
} else {
return It->second;
}
};
// Add call graph edges.
uint64_t NotProcessed = 0;
uint64_t TotalCalls = 0;
for (auto &It : BFs) {
auto *Function = &It.second;
if(!shouldOptimize(*Function) || !Function->hasProfile()) {
continue;
}
auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
const auto SrcId = lookupNode(Function);
uint64_t Offset = Function->getAddress();
auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
const auto DstId = lookupNode(DstFunc);
auto &A = Cg.incArcWeight(SrcId, DstId, Count);
if (!opts::UseEdgeCounts) {
A.AvgCallOffset += (Offset - DstFunc->getAddress());
}
DEBUG(dbgs() << "BOLT-DEBUG: Reorder functions: call " << *Function
<< " -> " << *DstFunc << " @ " << Offset << "\n");
return true;
}
return false;
};
for (auto *BB : Function->layout()) {
// Don't count calls from cold blocks
if (BB->isCold())
continue;
for (auto &Inst : *BB) {
// Find call instructions and extract target symbols from each one.
if (BC.MIA->isCall(Inst)) {
++TotalCalls;
if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
// For direct calls, just use the BB execution count.
assert(BB->hasProfile());
const auto Count = opts::UseEdgeCounts ? BB->getExecutionCount() : 1;
if (!recordCall(DstSym, Count))
++NotProcessed;
} else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
// For indirect calls and jump tables, use branch data.
assert(BranchDataOrErr);
const FuncBranchData &BranchData = BranchDataOrErr.get();
const auto DataOffset =
BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");
for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
// Count each target as a separate call.
++TotalCalls;
if (!BI.To.IsSymbol) {
++NotProcessed;
continue;
}
auto Itr = BC.GlobalSymbols.find(BI.To.Name);
if (Itr == BC.GlobalSymbols.end()) {
++NotProcessed;
continue;
}
const auto *DstSym =
BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");
if (!recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1))
++NotProcessed;
}
}
}
if (!opts::UseEdgeCounts) {
Offset += BC.computeCodeSize(&Inst, &Inst + 1);
}
}
}
}
outs() << "BOLT-WARNING: ReorderFunctions: " << NotProcessed
<< " callsites not processed out of " << TotalCalls << "\n";
// Normalize arc weights.
if (!opts::UseEdgeCounts) {
for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) {
auto& Func = Cg.Targets[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
A.AvgCallOffset /= A.Weight;
assert(A.AvgCallOffset < Cg.Targets[Caller].Size);
}
}
} else {
for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) {
auto &Func = Cg.Targets[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
}
}
}
}
void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs) {
std::vector<uint64_t> FuncAddr(Cg.Targets.size()); // Just for computing stats
uint64_t TotalSize = 0;
uint32_t Index = 0;
// Set order of hot functions based on clusters.
for (const auto& Cluster : Clusters) {
for (const auto FuncId : Cluster.Targets) {
assert(Cg.Targets[FuncId].Samples > 0);
Funcs[FuncId]->setIndex(Index++);
FuncAddr[FuncId] = TotalSize;
TotalSize += Cg.Targets[FuncId].Size;
}
}
if (opts::ReorderFunctions == BinaryFunction::RT_NONE)
return;
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType("hfsort"))
return;
#else
return;
#endif
}
TotalSize = 0;
uint64_t CurPage = 0;
uint64_t Hotfuncs = 0;
double TotalDistance = 0;
double TotalCalls = 0;
double TotalCalls64B = 0;
double TotalCalls4KB = 0;
double TotalCalls2MB = 0;
dbgs() << "============== page 0 ==============\n";
for (auto& Cluster : Clusters) {
dbgs() <<
format("-------- density = %.3lf (%u / %u) --------\n",
(double) Cluster.Samples / Cluster.Size,
Cluster.Samples, Cluster.Size);
for (auto FuncId : Cluster.Targets) {
if (Cg.Targets[FuncId].Samples > 0) {
Hotfuncs++;
dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId]
<< " (" << Cg.Targets[FuncId].Size << ")\n";
uint64_t Dist = 0;
uint64_t Calls = 0;
for (auto Dst : Cg.Targets[FuncId].Succs) {
auto& A = *Cg.Arcs.find(Arc(FuncId, Dst));
auto D =
std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset));
auto W = A.Weight;
Calls += W;
if (D < 64) TotalCalls64B += W;
if (D < 4096) TotalCalls4KB += W;
if (D < (2 << 20)) TotalCalls2MB += W;
Dist += A.Weight * D;
dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
"weight = %.0lf, callDist = %f\n",
A.Src, FuncAddr[A.Src], A.AvgCallOffset,
A.Dst, FuncAddr[A.Dst], A.Weight, D);
}
TotalCalls += Calls;
TotalDistance += Dist;
dbgs() << format("start = %6u : avgCallDist = %lu : %s\n",
TotalSize,
Calls ? Dist / Calls : 0,
Funcs[FuncId]->getPrintName().c_str());
TotalSize += Cg.Targets[FuncId].Size;
auto NewPage = TotalSize / PageSize;
if (NewPage != CurPage) {
CurPage = NewPage;
dbgs() << format("============== page %u ==============\n", CurPage);
}
}
}
}
dbgs() << format(" Number of hot functions: %u\n"
" Number of clusters: %lu\n",
Hotfuncs, Clusters.size())
<< format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n",
TotalCalls ? TotalDistance / TotalCalls : 0,
TotalDistance, TotalCalls)
<< format(" Total Calls = %.0lf\n", TotalCalls);
if (TotalCalls) {
dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n",
TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
<< format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n",
TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
<< format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n",
TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
}
}
namespace {
std::vector<std::string> readFunctionOrderFile() {
std::vector<std::string> FunctionNames;
std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in);
if (!FuncsFile) {
errs() << "Ordered functions file \"" << opts::FunctionOrderFile
<< "\" can't be opened.\n";
exit(1);
}
std::string FuncName;
while (std::getline(FuncsFile, FuncName)) {
FunctionNames.push_back(FuncName);
}
return FunctionNames;
}
}
void ReorderFunctions::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) {
errs() << "BOLT-ERROR: Function reordering only works when "
<< "relocs are enabled.\n";
exit(1);
}
if (opts::ReorderFunctions != BinaryFunction::RT_NONE &&
opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT &&
opts::ReorderFunctions != BinaryFunction::RT_USER) {
buildCallGraph(BC, BFs);
}
std::vector<Cluster> Clusters;
switch(opts::ReorderFunctions) {
case BinaryFunction::RT_NONE:
break;
case BinaryFunction::RT_EXEC_COUNT:
{
std::vector<BinaryFunction *> SortedFunctions(BFs.size());
uint32_t Index = 0;
std::transform(BFs.begin(),
BFs.end(),
SortedFunctions.begin(),
[](std::pair<const uint64_t, BinaryFunction> &BFI) {
return &BFI.second;
});
std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
[&](const BinaryFunction *A, const BinaryFunction *B) {
if (!opts::shouldProcess(*A))
return false;
const auto PadA = opts::padFunction(*A);
const auto PadB = opts::padFunction(*B);
if (!PadA || !PadB) {
if (PadA)
return true;
if (PadB)
return false;
}
return !A->hasProfile() &&
(B->hasProfile() ||
(A->getExecutionCount() > B->getExecutionCount()));
});
for (auto *BF : SortedFunctions) {
if (BF->hasProfile())
BF->setIndex(Index++);
}
}
break;
case BinaryFunction::RT_HFSORT:
Clusters = clusterize(Cg);
break;
case BinaryFunction::RT_HFSORT_PLUS:
Clusters = hfsortPlus(Cg);
break;
case BinaryFunction::RT_PETTIS_HANSEN:
Clusters = pettisAndHansen(Cg);
break;
case BinaryFunction::RT_RANDOM:
std::srand(opts::RandomSeed);
Clusters = randomClusters(Cg);
break;
case BinaryFunction::RT_USER:
{
uint32_t Index = 0;
for (const auto &Function : readFunctionOrderFile()) {
std::vector<uint64_t> FuncAddrs;
auto Itr = BC.GlobalSymbols.find(Function);
if (Itr == BC.GlobalSymbols.end()) {
uint32_t LocalID = 1;
while(1) {
// If we can't find the main symbol name, look for alternates.
Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID));
if (Itr != BC.GlobalSymbols.end())
FuncAddrs.push_back(Itr->second);
else
break;
LocalID++;
}
} else {
FuncAddrs.push_back(Itr->second);
}
if (FuncAddrs.empty()) {
errs() << "BOLT-WARNING: Reorder functions: can't find function for "
<< Function << ".\n";
continue;
}
for (const auto FuncAddr : FuncAddrs) {
const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat");
assert(FuncSym);
auto *BF = BC.getFunctionForSymbol(FuncSym);
if (!BF) {
errs() << "BOLT-WARNING: Reorder functions: can't find function for "
<< Function << ".\n";
break;
}
if (!BF->hasValidIndex()) {
BF->setIndex(Index++);
} else if (opts::Verbosity > 0) {
errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n";
}
}
}
}
break;
}
reorder(std::move(Clusters), BFs);
if (!opts::GenerateFunctionOrderFile.empty()) {
std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out);
if (!FuncsFile) {
errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile
<< "\" can't be opened.\n";
exit(1);
}
std::vector<BinaryFunction *> SortedFunctions(BFs.size());
std::transform(BFs.begin(),
BFs.end(),
SortedFunctions.begin(),
[](std::pair<const uint64_t, BinaryFunction> &BFI) {
return &BFI.second;
});
// Sort functions by index.
std::stable_sort(
SortedFunctions.begin(),
SortedFunctions.end(),
[](const BinaryFunction *A, const BinaryFunction *B) {
if (A->hasValidIndex() && B->hasValidIndex()) {
return A->getIndex() < B->getIndex();
} else if (A->hasValidIndex() && !B->hasValidIndex()) {
return true;
} else if (!A->hasValidIndex() && B->hasValidIndex()) {
return false;
} else {
return A->getAddress() < B->getAddress();
}
});
for (const auto *Func : SortedFunctions) {
if (!Func->hasValidIndex())
break;
FuncsFile << Func->getSymbol()->getName().data() << "\n";
}
FuncsFile.close();
outs() << "BOLT-INFO: dumped function order to \""
<< opts::GenerateFunctionOrderFile << "\"\n";
exit(0);
}
}
} // namespace bolt
} // namespace llvm
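
For reference, the file consumed by -function-order above (and written out by -generate-function-order) is plain text with one function symbol per line, processed top to bottom; the names here are hypothetical:

main
_ZN6Server11handleQueryERK7Request
frob_cache

Names that are not found directly are also tried with a "/1", "/2", ... suffix, matching the alternate-symbol lookup loop above.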

View File

@ -18,6 +18,7 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "llvm/Support/CommandLine.h"
#include <map>
#include <set>
#include <string>
@ -358,29 +359,6 @@ public:
std::set<uint64_t> &LargeFunctions) override;
};
/// Modify function order for streaming based on hotness.
class ReorderFunctions : public BinaryFunctionPass {
static constexpr uint32_t PageSize = 2 << 20;
std::vector<BinaryFunction *> Funcs;
std::unordered_map<const BinaryFunction *, TargetId> FuncToTargetId;
TargetGraph Cg;
void buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
void reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs);
public:
explicit ReorderFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "reorder-functions";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm

View File

@ -1,5 +1,6 @@
add_llvm_library(LLVMBOLTPasses
BinaryPasses.cpp
CallGraph.cpp
DataflowAnalysis.cpp
DataflowInfoManager.cpp
FrameAnalysis.cpp
@ -9,7 +10,9 @@ add_llvm_library(LLVMBOLTPasses
IndirectCallPromotion.cpp
Inliner.cpp
LivenessAnalysis.cpp
PettisAndHansen.cpp
ReorderAlgorithm.cpp
ReorderFunctions.cpp
StackPointerTracking.cpp
)

bolt/Passes/CallGraph.cpp (new file, 262 lines)
View File

@ -0,0 +1,262 @@
//===--- Passes/CallGraph.cpp ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "CallGraph.h"
#include "BinaryFunction.h"
#include "BinaryContext.h"
#define DEBUG_TYPE "callgraph"
#if defined(__x86_64__) && !defined(_MSC_VER)
# if (!defined USE_SSECRC)
# define USE_SSECRC
# endif
#else
# undef USE_SSECRC
#endif
namespace {
inline size_t hash_int64_fallback(int64_t key) {
// "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
// http://www.concentric.net/~ttwang/tech/inthash.htm
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ ((unsigned long long)key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ ((unsigned long long)key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ ((unsigned long long)key >> 28);
return static_cast<size_t>(static_cast<uint32_t>(key));
}
inline size_t hash_int64(int64_t k) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
size_t h = 0;
__asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k));
return h;
#else
return hash_int64_fallback(k);
#endif
}
inline size_t hash_int64_pair(int64_t k1, int64_t k2) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
// crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes
// differently from (k2, k1).
k1 += k1;
__asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2));
return k1;
#else
return (hash_int64(k1) << 1) ^ hash_int64(k2);
#endif
}
}
namespace llvm {
namespace bolt {
int64_t CallGraph::Arc::Hash::operator()(const Arc &Arc) const {
#ifdef USE_STD_HASH
std::hash<int64_t> Hasher;
return hashCombine(Hasher(Arc.Src), Arc.Dst);
#else
return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst));
#endif
}
CallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::function<bool (const BinaryFunction &BF)> Filter,
bool IncludeColdCalls,
bool UseFunctionHotSize,
bool UseEdgeCounts) {
CallGraph Cg;
// Add call graph nodes.
auto lookupNode = [&](BinaryFunction *Function) {
auto It = Cg.FuncToNodeId.find(Function);
if (It == Cg.FuncToNodeId.end()) {
// It's ok to use the hot size here when the function is split. This is
// because emitFunctions will emit the hot part first in the order that is
// computed by ReorderFunctions. The cold part will be emitted with the
// rest of the cold functions and code.
const auto Size = UseFunctionHotSize && Function->isSplit()
? Function->estimateHotSize()
: Function->estimateSize();
const auto Id = Cg.addNode(Size);
assert(size_t(Id) == Cg.Funcs.size());
Cg.Funcs.push_back(Function);
Cg.FuncToNodeId[Function] = Id;
// NOTE: for functions without a profile, we set the number of samples
// to zero. This will keep these functions from appearing in the hot
// section. This is a little weird because we wouldn't be trying to
// create a node for a function unless it was the target of a call from
// a hot block. The alternative would be to set the count to one or
// accumulate the number of calls from the callsite into the function
// samples. Results from performance testing seem to favor the zero
// count though, so I'm leaving it this way for now.
Cg.Nodes[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0;
assert(Cg.Funcs[Id] == Function);
return Id;
} else {
return It->second;
}
};
// Add call graph edges.
uint64_t NotProcessed = 0;
uint64_t TotalCalls = 0;
for (auto &It : BFs) {
auto *Function = &It.second;
if (Filter(*Function)) {
continue;
}
auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
const auto SrcId = lookupNode(Function);
uint64_t Offset = Function->getAddress();
auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
const auto DstId = lookupNode(DstFunc);
auto &A = Cg.incArcWeight(SrcId, DstId, Count);
if (!UseEdgeCounts) {
A.AvgCallOffset += (Offset - DstFunc->getAddress());
}
DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function
<< " -> " << *DstFunc << " @ " << Offset << "\n");
return true;
}
return false;
};
for (auto *BB : Function->layout()) {
// Don't count calls from cold blocks
if (BB->isCold() && !IncludeColdCalls)
continue;
for (auto &Inst : *BB) {
// Find call instructions and extract target symbols from each one.
if (!BC.MIA->isCall(Inst))
continue;
++TotalCalls;
if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
// For direct calls, just use the BB execution count.
const auto Count = UseEdgeCounts && BB->hasProfile()
? BB->getExecutionCount() : 1;
if (!recordCall(DstSym, Count))
++NotProcessed;
} else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
// For indirect calls and jump tables, use branch data.
if (!BranchDataOrErr) {
++NotProcessed;
continue;
}
const FuncBranchData &BranchData = BranchDataOrErr.get();
const auto DataOffset =
BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");
for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
// Count each target as a separate call.
++TotalCalls;
if (!BI.To.IsSymbol) {
++NotProcessed;
continue;
}
auto Itr = BC.GlobalSymbols.find(BI.To.Name);
if (Itr == BC.GlobalSymbols.end()) {
++NotProcessed;
continue;
}
const auto *DstSym =
BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");
if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1))
++NotProcessed;
}
}
if (!UseEdgeCounts) {
Offset += BC.computeCodeSize(&Inst, &Inst + 1);
}
}
}
}
outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed
<< " callsites not processed out of " << TotalCalls << "\n";
return Cg;
}
CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint32_t Samples) {
auto Id = Nodes.size();
Nodes.emplace_back(Size, Samples);
return Id;
}
const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W) {
auto Res = Arcs.emplace(Src, Dst, W);
if (!Res.second) {
Res.first->Weight += W;
return *Res.first;
}
Nodes[Src].Succs.push_back(Dst);
Nodes[Dst].Preds.push_back(Src);
return *Res.first;
}
std::deque<BinaryFunction *> CallGraph::buildTraversalOrder() {
std::deque<BinaryFunction *> TopologicalOrder;
enum NodeStatus { NEW, VISITING, VISITED };
std::vector<NodeStatus> NodeStatus(Funcs.size());
std::stack<NodeId> Worklist;
for (auto *Func : Funcs) {
const auto Id = FuncToNodeId.at(Func);
Worklist.push(Id);
NodeStatus[Id] = NEW;
}
while (!Worklist.empty()) {
const auto FuncId = Worklist.top();
Worklist.pop();
if (NodeStatus[FuncId] == VISITED)
continue;
if (NodeStatus[FuncId] == VISITING) {
TopologicalOrder.push_back(Funcs[FuncId]);
NodeStatus[FuncId] = VISITED;
continue;
}
assert(NodeStatus[FuncId] == NEW);
NodeStatus[FuncId] = VISITING;
Worklist.push(FuncId);
for (const auto Callee : Nodes[FuncId].Succs) {
if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
continue;
Worklist.push(Callee);
}
}
return TopologicalOrder;
}
}
}
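
A small standalone sketch of the node/arc API above (not from the commit); it shows that repeated incArcWeight() calls accumulate weight on one arc instead of adding parallel edges:

#include "CallGraph.h"
#include <cassert>

using namespace llvm::bolt;

void callGraphApiSketch() {
  CallGraph Cg;
  const auto A = Cg.addNode(/*Size=*/128, /*Samples=*/10);
  const auto B = Cg.addNode(/*Size=*/256);

  Cg.incArcWeight(A, B, 5.0);  // creates the arc and updates Succs/Preds
  Cg.incArcWeight(A, B, 3.0);  // same arc; weight becomes 8.0, no new edge

  assert(Cg.Arcs.size() == 1);
  assert(Cg.Arcs.find(CallGraph::Arc(A, B))->Weight == 8.0);
  assert(Cg.Nodes[B].Preds.size() == 1 && Cg.Nodes[B].Preds[0] == A);
}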

bolt/Passes/CallGraph.h (new file, 113 lines)
View File

@ -0,0 +1,113 @@
//===--- Passes/CallGraph.h -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H
#include <string>
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include <functional>
#include <map>
#include <deque>
namespace llvm {
namespace bolt {
class BinaryFunction;
class BinaryContext;
// TODO: find better place for this
inline int64_t hashCombine(const int64_t Seed, const int64_t Val) {
std::hash<int64_t> Hasher;
return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
}
/// A call graph class.
class CallGraph {
public:
using NodeId = size_t;
static constexpr NodeId InvalidId = -1;
class Arc {
public:
struct Hash {
int64_t operator()(const Arc &Arc) const;
};
Arc(NodeId S, NodeId D, double W = 0)
: Src(S)
, Dst(D)
, Weight(W)
{}
Arc(const Arc&) = delete;
friend bool operator==(const Arc &Lhs, const Arc &Rhs) {
return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst;
}
const NodeId Src;
const NodeId Dst;
mutable double Weight;
mutable double NormalizedWeight{0};
mutable double AvgCallOffset{0};
};
class Node {
public:
explicit Node(uint32_t Size, uint32_t Samples = 0)
: Size(Size), Samples(Samples)
{}
uint32_t Size;
uint32_t Samples;
// preds and succs contain no duplicate elements and self arcs are not allowed
std::vector<NodeId> Preds;
std::vector<NodeId> Succs;
};
NodeId addNode(uint32_t Size, uint32_t Samples = 0);
const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0);
/// Compute a DFS traversal of the call graph.
std::deque<BinaryFunction *> buildTraversalOrder();
std::vector<Node> Nodes;
std::unordered_set<Arc, Arc::Hash> Arcs;
std::vector<BinaryFunction *> Funcs;
std::unordered_map<const BinaryFunction *, NodeId> FuncToNodeId;
};
inline bool NoFilter(const BinaryFunction &) { return false; }
/// Builds a call graph from the map of BinaryFunctions provided in BFs.
/// The arguments control how the graph is constructed.
/// Filter is called on each function; any function for which it returns true
/// is omitted from the graph.
/// If IncludeColdCalls is true, calls from cold BBs are included in the
/// graph; otherwise they are ignored.
/// UseFunctionHotSize controls whether a function's hot size is used when
/// filling in the Size attribute of new Nodes.
/// UseEdgeCounts controls whether the AvgCallOffset attribute on Arcs is
/// computed using the offsets of call instructions.
CallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::function<bool (const BinaryFunction &BF)> Filter = NoFilter,
bool IncludeColdCalls = true,
bool UseFunctionHotSize = false,
bool UseEdgeCounts = false);
} // namespace bolt
} // namespace llvm
#endif
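
As a usage sketch of the builder declared above (illustrative; FrameAnalysis and FrameOptimizer below simply call buildCallGraph(BC, BFs) with the defaults), a layout-oriented caller can narrow the graph instead:

// Returning true from the filter omits that function from the graph.
auto Cg = buildCallGraph(BC, BFs,
    [](const BinaryFunction &BF) { return !BF.hasProfile(); },
    /*IncludeColdCalls=*/false,   // drop calls made from cold blocks
    /*UseFunctionHotSize=*/true,  // size split functions by their hot part
    /*UseEdgeCounts=*/true);      // weight arcs by profile counts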

View File

@ -275,71 +275,6 @@ FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const {
return make_error_code(errc::result_out_of_range);
}
void FrameAnalysis::buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
for (auto &I : BFs) {
BinaryFunction &Caller = I.second;
Functions.emplace(&Caller);
for (BinaryBasicBlock &BB : Caller) {
for (MCInst &Inst : BB) {
if (!BC.MIA->isCall(Inst))
continue;
auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
if (!TargetSymbol) {
// This is an indirect call, we cannot record a target.
continue;
}
auto *Function = BC.getFunctionForSymbol(TargetSymbol);
if (!Function) {
// Call to a function without a BinaryFunction object.
continue;
}
// Create a new edge in the call graph
CallGraphEdges[&Caller].emplace_back(Function);
ReverseCallGraphEdges[Function].emplace_back(&Caller);
}
}
}
}
void FrameAnalysis::buildCGTraversalOrder() {
enum NodeStatus { NEW, VISITING, VISITED };
std::unordered_map<const BinaryFunction *, NodeStatus> NodeStatus;
std::stack<BinaryFunction *> Worklist;
for (auto *Func : Functions) {
Worklist.push(Func);
NodeStatus[Func] = NEW;
}
while (!Worklist.empty()) {
auto *Func = Worklist.top();
Worklist.pop();
if (NodeStatus[Func] == VISITED)
continue;
if (NodeStatus[Func] == VISITING) {
TopologicalCGOrder.push_back(Func);
NodeStatus[Func] = VISITED;
continue;
}
assert(NodeStatus[Func] == NEW);
NodeStatus[Func] = VISITING;
Worklist.push(Func);
for (auto *Callee : CallGraphEdges[Func]) {
if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
continue;
Worklist.push(Callee);
}
}
}
void FrameAnalysis::getInstClobberList(const BinaryContext &BC,
const MCInst &Inst,
BitVector &KillSet) const {
@ -412,8 +347,8 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) {
}
if (RegsKilledMap[Func] != RegsKilled || Updated) {
for (auto Caller : ReverseCallGraphEdges[Func]) {
Queue.push(Caller);
for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
Queue.push(Cg.Funcs[Caller]);
}
}
RegsKilledMap[Func] = std::move(RegsKilled);
@ -647,11 +582,11 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC,
std::set<uint64_t> &) {
{
NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true);
buildCallGraph(BC, BFs);
Cg = buildCallGraph(BC, BFs);
}
{
NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true);
buildCGTraversalOrder();
TopologicalCGOrder = Cg.buildTraversalOrder();
}
{
NamedRegionTimer T1("build clobber map", "FOP breakdown", true);

View File

@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H
#include "BinaryPasses.h"
#include "CallGraph.h"
#include "StackPointerTracking.h"
namespace llvm {
@ -112,14 +113,8 @@ raw_ostream &operator<<(raw_ostream &OS,
///
class FrameAnalysis : public BinaryFunctionPass {
/// Call graph info
/// The set of functions analyzed by our call graph
std::set<BinaryFunction *> Functions;
/// Model the "function calls function" edges
std::map<const BinaryFunction *, std::vector<BinaryFunction *>>
CallGraphEdges;
/// Model the "function called by function" edges
std::map<const BinaryFunction *, std::vector<BinaryFunction *>>
ReverseCallGraphEdges;
CallGraph Cg;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up
std::deque<BinaryFunction *> TopologicalCGOrder;
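
A brief sketch of how this ordering is meant to be consumed (illustrative, not from the commit): CallGraph::buildTraversalOrder() appends a function only after its callees, so walking the deque front to back visits callees before callers (absent cycles), for example:

for (BinaryFunction *Func : TopologicalCGOrder) {
  // Every callee of Func reachable without a cycle has already been
  // visited, so per-callee results (e.g. clobbered-register sets) can be
  // consumed while processing Func.
}
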
@ -169,15 +164,6 @@ class FrameAnalysis : public BinaryFunctionPass {
void addFIEFor(const BinaryContext &BC, MCInst &Inst,
const FrameIndexEntry &FIE);
/// Perform the initial step of populating CallGraphEdges and
/// ReverseCallGraphEdges for all functions in BFs.
void buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
/// Compute a DFS traversal of the call graph in Functions, CallGraphEdges
/// and ReverseCallGraphEdges and stores it in TopologicalCGOrder.
void buildCGTraversalOrder();
/// Compute the set of registers \p Func may write to during its execution,
/// starting at the point when it is called up until when it returns. Returns
/// a BitVector the size of the target number of registers, representing the

View File

@ -24,71 +24,6 @@ extern cl::opt<unsigned> Verbosity;
namespace llvm {
namespace bolt {
void FrameOptimizerPass::buildCallGraph(
const BinaryContext &BC, std::map<uint64_t, BinaryFunction> &BFs) {
for (auto &I : BFs) {
BinaryFunction &Caller = I.second;
Functions.emplace(&Caller);
for (BinaryBasicBlock &BB : Caller) {
for (MCInst &Inst : BB) {
if (!BC.MIA->isCall(Inst))
continue;
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
if (!TargetSymbol) {
// This is an indirect call, we cannot record a target.
continue;
}
const auto *Function = BC.getFunctionForSymbol(TargetSymbol);
if (!Function) {
// Call to a function without a BinaryFunction object.
continue;
}
// Create a new edge in the call graph
CallGraphEdges[&Caller].emplace_back(Function);
ReverseCallGraphEdges[Function].emplace_back(&Caller);
}
}
}
}
void FrameOptimizerPass::buildCGTraversalOrder() {
enum NodeStatus { NEW, VISITING, VISITED };
std::unordered_map<const BinaryFunction *, NodeStatus> NodeStatus;
std::stack<const BinaryFunction *> Worklist;
for (auto *Func : Functions) {
Worklist.push(Func);
NodeStatus[Func] = NEW;
}
while (!Worklist.empty()) {
const auto *Func = Worklist.top();
Worklist.pop();
if (NodeStatus[Func] == VISITED)
continue;
if (NodeStatus[Func] == VISITING) {
TopologicalCGOrder.push_back(Func);
NodeStatus[Func] = VISITED;
continue;
}
assert(NodeStatus[Func] == NEW);
NodeStatus[Func] = VISITING;
Worklist.push(Func);
for (const auto *Callee : CallGraphEdges[Func]) {
if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
continue;
Worklist.push(Callee);
}
}
}
void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC,
const MCInst &Inst,
BitVector &KillSet) const {
@ -161,8 +96,8 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) {
}
if (RegsKilledMap[Func] != RegsKilled) {
for (auto Caller : ReverseCallGraphEdges[Func]) {
Queue.push(Caller);
for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
Queue.push(Cg.Funcs[Caller]);
}
}
RegsKilledMap[Func] = std::move(RegsKilled);
@ -794,8 +729,8 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
uint64_t CountFunctionsNotOptimized{0};
uint64_t CountFunctionsFailedRestoreFI{0};
uint64_t CountDenominator{0};
buildCallGraph(BC, BFs);
buildCGTraversalOrder();
Cg = buildCallGraph(BC, BFs);
TopologicalCGOrder = Cg.buildTraversalOrder();
buildClobberMap(BC);
for (auto &I : BFs) {
auto Count = I.second.getExecutionCount();

View File

@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
#include "BinaryPasses.h"
#include "CallGraph.h"
namespace llvm {
namespace bolt {
@ -75,17 +76,11 @@ class FrameOptimizerPass : public BinaryFunctionPass {
uint64_t CountFunctionsAllClobber{0};
/// Call graph info
/// The set of functions analyzed by our call graph
std::set<BinaryFunction *> Functions;
/// Model the "function calls function" edges
std::map<const BinaryFunction *, std::vector<const BinaryFunction *>>
CallGraphEdges;
/// Model the "function called by function" edges
std::map<const BinaryFunction *, std::vector<const BinaryFunction *>>
ReverseCallGraphEdges;
CallGraph Cg;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up
std::deque<const BinaryFunction *> TopologicalCGOrder;
std::deque<BinaryFunction *> TopologicalCGOrder;
/// Map functions to the set of registers they may overwrite starting at when
/// it is called until it returns to the caller.
@ -126,15 +121,6 @@ public:
void getInstClobberList(const BinaryContext &BC, const MCInst &Inst,
BitVector &KillSet) const;
private:
/// Perform the initial step of populating CallGraphEdges and
/// ReverseCallGraphEdges for all functions in BFs.
void buildCallGraph(const BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
/// Compute a DFS traversal of the call graph in Functions, CallGraphEdges
/// and ReverseCallGraphEdges and stores it in TopologicalCGOrder.
void buildCGTraversalOrder();
/// Compute the set of registers \p Func may write to during its execution,
/// starting at the point when it is called up until when it returns. Returns
/// a BitVector the size of the target number of registers, representing the

View File

@ -40,6 +40,10 @@
namespace llvm {
namespace bolt {
using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
namespace {
// The number of pages to reserve for the functions with highest
@ -55,32 +59,11 @@ constexpr double MinArcProbability = 0.1;
// willing to degrade its density by merging a callee.
constexpr int CallerDegradeFactor = 8;
// Maximum size of a cluster, in bytes.
constexpr uint32_t MaxClusterSize = 1 << 20;
constexpr uint32_t PageSize = 2 << 20;
}
////////////////////////////////////////////////////////////////////////////////
TargetId TargetGraph::addTarget(uint32_t Size, uint32_t Samples) {
auto Id = Targets.size();
Targets.emplace_back(Size, Samples);
return Id;
}
const Arc &TargetGraph::incArcWeight(TargetId Src, TargetId Dst, double W) {
auto Res = Arcs.emplace(Src, Dst, W);
if (!Res.second) {
Res.first->Weight += W;
return *Res.first;
}
Targets[Src].Succs.push_back(Dst);
Targets[Dst].Preds.push_back(Src);
return *Res.first;
}
Cluster::Cluster(TargetId Id, const TargetNode &Func) {
Cluster::Cluster(NodeId Id, const Node &Func) {
Targets.push_back(Id);
Size = Func.Size;
Samples = Func.Samples;
@ -103,53 +86,47 @@ std::string Cluster::toString() const {
}
namespace {
////////////////////////////////////////////////////////////////////////////////
bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
return C1.density() > C2.density();
}
////////////////////////////////////////////////////////////////////////////////
void freezeClusters(const TargetGraph &Cg, std::vector<Cluster> &Clusters) {
void freezeClusters(const CallGraph &Cg, std::vector<Cluster> &Clusters) {
uint32_t TotalSize = 0;
std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity);
for (auto &C : Clusters) {
uint32_t NewSize = TotalSize + C.Size;
if (NewSize > FrozenPages * PageSize) break;
if (NewSize > FrozenPages * HugePageSize) break;
C.Frozen = true;
TotalSize = NewSize;
auto Fid = C.Targets[0];
DEBUG(dbgs() <<
format("freezing cluster for func %d, size = %u, samples = %u)\n",
Fid, Cg.Targets[Fid].Size, Cg.Targets[Fid].Samples););
Fid, Cg.Nodes[Fid].Size, Cg.Nodes[Fid].Samples););
}
}
void mergeInto(Cluster &Into, Cluster&& Other, const double Aw = 0) {
Into.Targets.insert(Into.Targets.end(),
Other.Targets.begin(),
Other.Targets.end());
Into.Size += Other.Size;
Into.Samples += Other.Samples;
}
void Cluster::merge(Cluster&& Other, const double Aw) {
Targets.insert(Targets.end(),
Other.Targets.begin(),
Other.Targets.end());
Size += Other.Size;
Samples += Other.Samples;
Other.Size = 0;
Other.Samples = 0;
Other.Targets.clear();
}
}
std::vector<Cluster> clusterize(const TargetGraph &Cg) {
std::vector<TargetId> SortedFuncs;
std::vector<Cluster> clusterize(const CallGraph &Cg) {
std::vector<NodeId> SortedFuncs;
// indexed by TargetId, keeps its current cluster
std::vector<Cluster*> FuncCluster(Cg.Targets.size(), nullptr);
// indexed by NodeId, keeps its current cluster
std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
std::vector<Cluster> Clusters;
Clusters.reserve(Cg.Targets.size());
Clusters.reserve(Cg.Nodes.size());
for (TargetId F = 0; F < Cg.Targets.size(); F++) {
if (Cg.Targets[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Targets[F]);
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
SortedFuncs.push_back(F);
}
@ -164,9 +141,9 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
std::sort(
SortedFuncs.begin(),
SortedFuncs.end(),
[&] (const TargetId F1, const TargetId F2) {
const auto &Func1 = Cg.Targets[F1];
const auto &Func2 = Cg.Targets[F2];
[&] (const NodeId F1, const NodeId F2) {
const auto &Func1 = Cg.Nodes[F1];
const auto &Func2 = Cg.Nodes[F2];
return
(uint64_t)Func1.Samples * Func2.Size > // TODO: is this correct?
(uint64_t)Func2.Samples * Func1.Size;
@ -180,12 +157,12 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
if (Cluster->Frozen) continue;
// Find best predecessor.
TargetId BestPred = InvalidId;
NodeId BestPred = CallGraph::InvalidId;
double BestProb = 0;
for (const auto Src : Cg.Targets[Fid].Preds) {
for (const auto Src : Cg.Nodes[Fid].Preds) {
auto &A = *Cg.Arcs.find(Arc(Src, Fid));
if (BestPred == InvalidId || A.NormalizedWeight > BestProb) {
if (BestPred == CallGraph::InvalidId || A.NormalizedWeight > BestProb) {
BestPred = A.Src;
BestProb = A.NormalizedWeight;
}
@ -196,7 +173,7 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
// caller is too low.
if (BestProb < MinArcProbability) continue;
assert(BestPred != InvalidId);
assert(BestPred != CallGraph::InvalidId);
auto PredCluster = FuncCluster[BestPred];
@ -223,13 +200,13 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
DEBUG(dbgs() << format("merging %s -> %s: %u\n",
PredCluster->toString().c_str(),
Cluster->toString().c_str(),
Cg.Targets[Fid].Samples););
Cg.Nodes[Fid].Samples););
for (auto F : Cluster->Targets) {
FuncCluster[F] = PredCluster;
}
mergeInto(*PredCluster, std::move(*Cluster));
PredCluster->merge(std::move(*Cluster));
}
// Return the set of Clusters that are left, which are the ones that
@ -250,203 +227,14 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
return SortedClusters;
}
////////////////////////////////////////////////////////////////////////////////
namespace {
class ClusterArc {
public:
ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0)
: C1(std::min(Ca, Cb))
, C2(std::max(Ca, Cb))
, Weight(W)
{}
friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) {
return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2;
}
Cluster *const C1;
Cluster *const C2;
mutable double Weight;
};
class ClusterArcHash {
public:
int64_t operator()(const ClusterArc &Arc) const {
std::hash<int64_t> Hasher;
return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2));
}
};
using ClusterArcSet = std::unordered_set<ClusterArc, ClusterArcHash>;
void orderFuncs(const TargetGraph &Cg, Cluster *C1, Cluster *C2) {
TargetId C1head = C1->Targets.front();
TargetId C1tail = C1->Targets.back();
TargetId C2head = C2->Targets.front();
TargetId C2tail = C2->Targets.back();
double C1headC2head = 0;
double C1headC2tail = 0;
double C1tailC2head = 0;
double C1tailC2tail = 0;
for (const auto &Arc : Cg.Arcs) {
if ((Arc.Src == C1head && Arc.Dst == C2head) ||
(Arc.Dst == C1head && Arc.Src == C2head)) {
C1headC2head += Arc.Weight;
} else if ((Arc.Src == C1head && Arc.Dst == C2tail) ||
(Arc.Dst == C1head && Arc.Src == C2tail)) {
C1headC2tail += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2head) ||
(Arc.Dst == C1tail && Arc.Src == C2head)) {
C1tailC2head += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2tail) ||
(Arc.Dst == C1tail && Arc.Src == C2tail)) {
C1tailC2tail += Arc.Weight;
}
}
const double Max = std::max(std::max(C1headC2head, C1headC2tail),
std::max(C1tailC2head, C1tailC2tail));
if (C1headC2head == Max) {
// flip C1
std::reverse(C1->Targets.begin(), C1->Targets.end());
} else if (C1headC2tail == Max) {
// flip C1 C2
std::reverse(C1->Targets.begin(), C1->Targets.end());
std::reverse(C2->Targets.begin(), C2->Targets.end());
} else if (C1tailC2tail == Max) {
// flip C2
std::reverse(C2->Targets.begin(), C2->Targets.end());
}
}
}
std::vector<Cluster> pettisAndHansen(const TargetGraph &Cg) {
// indexed by TargetId, keeps its current cluster
std::vector<Cluster*> FuncCluster(Cg.Targets.size(), nullptr);
std::vector<Cluster> randomClusters(const CallGraph &Cg) {
std::vector<NodeId> FuncIds(Cg.Nodes.size(), 0);
std::vector<Cluster> Clusters;
std::vector<TargetId> Funcs;
Clusters.reserve(Cg.Nodes.size());
Clusters.reserve(Cg.Targets.size());
for (TargetId F = 0; F < Cg.Targets.size(); F++) {
if (Cg.Targets[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Targets[F]);
FuncCluster[F] = &Clusters.back();
Funcs.push_back(F);
}
ClusterArcSet Carcs;
auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) {
auto Res = Carcs.emplace(C1, C2, Weight);
if (!Res.second) {
Res.first->Weight += Weight;
}
};
// Create a std::vector of cluster arcs
for (auto &Arc : Cg.Arcs) {
if (Arc.Weight == 0) continue;
auto const S = FuncCluster[Arc.Src];
auto const D = FuncCluster[Arc.Dst];
// ignore if s or d is nullptr
if (S == nullptr || D == nullptr) continue;
// ignore self-edges
if (S == D) continue;
insertOrInc(S, D, Arc.Weight);
}
// Find an arc with max weight and merge its nodes
while (!Carcs.empty()) {
auto Maxpos = std::max_element(
Carcs.begin(),
Carcs.end(),
[&] (const ClusterArc &Carc1, const ClusterArc &Carc2) {
return Carc1.Weight < Carc2.Weight;
}
);
auto Max = *Maxpos;
Carcs.erase(Maxpos);
auto const C1 = Max.C1;
auto const C2 = Max.C2;
if (C1->Size + C2->Size > MaxClusterSize) continue;
if (C1->Frozen || C2->Frozen) continue;
// order functions and merge cluster
orderFuncs(Cg, C1, C2);
DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(),
C1->toString().c_str(), Max.Weight););
// update carcs: merge C1arcs to C2arcs
std::unordered_map<ClusterArc, Cluster *, ClusterArcHash> C2arcs;
for (auto &Carc : Carcs) {
if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2);
if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1);
}
for (auto It : C2arcs) {
auto const C = It.second;
auto const C2arc = It.first;
insertOrInc(C, C1, C2arc.Weight);
Carcs.erase(C2arc);
}
// update FuncCluster
for (auto F : C2->Targets) {
FuncCluster[F] = C1;
}
mergeInto(*C1, std::move(*C2), Max.Weight);
}
// Return the set of Clusters that are left, which are the ones that
// didn't get merged.
std::set<Cluster*> LiveClusters;
std::vector<Cluster> OutClusters;
for (auto Fid : Funcs) {
LiveClusters.insert(FuncCluster[Fid]);
}
for (auto C : LiveClusters) {
OutClusters.push_back(std::move(*C));
}
std::sort(OutClusters.begin(),
OutClusters.end(),
compareClustersDensity);
return OutClusters;
}
std::vector<Cluster> randomClusters(const TargetGraph &Cg) {
std::vector<TargetId> FuncIds(Cg.Targets.size(), 0);
std::vector<Cluster> Clusters;
Clusters.reserve(Cg.Targets.size());
for (TargetId F = 0; F < Cg.Targets.size(); F++) {
if (Cg.Targets[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Targets[F]);
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
}
std::sort(Clusters.begin(),
@ -477,7 +265,7 @@ std::vector<Cluster> randomClusters(const TargetGraph &Cg) {
if (MergeIdx == Clusters.size()) {
++Idx;
} else {
mergeInto(Clusters[Idx], std::move(Clusters[MergeIdx]));
Clusters[Idx].merge(std::move(Clusters[MergeIdx]));
Clusters.erase(Clusters.begin() + MergeIdx);
}
}
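
A worked instance of the density sort used by clusterize() above, with made-up numbers; comparing Samples1 * Size2 against Samples2 * Size1 orders by density while avoiding floating-point division:

// Func1: Samples = 100, Size = 50  -> density 2.0
// Func2: Samples =  30, Size = 10  -> density 3.0
// Comparator: 100 * 10 = 1000 is not greater than 30 * 50 = 1500,
// so Func2 sorts ahead of Func1, i.e. higher-density functions come first.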

View File

@ -37,157 +37,60 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_HFSORT_H
#define LLVM_TOOLS_LLVM_BOLT_HFSORT_H
#include <string>
#include <unordered_set>
#include <vector>
#include <functional>
#include "CallGraph.h"
#if defined(__x86_64__) && !defined(_MSC_VER)
# if (!defined USE_SSECRC)
# define USE_SSECRC
# endif
#else
# undef USE_SSECRC
#endif
#include <string>
#include <vector>
namespace llvm {
namespace bolt {
using TargetId = size_t;
constexpr TargetId InvalidId = -1;
class Arc {
public:
Arc(TargetId S, TargetId D, double W = 0)
: Src(S)
, Dst(D)
, Weight(W)
{}
Arc(const Arc&) = delete;
friend bool operator==(const Arc &Lhs, const Arc &Rhs) {
return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst;
}
const TargetId Src;
const TargetId Dst;
mutable double Weight;
mutable double NormalizedWeight{0};
mutable double AvgCallOffset{0};
};
namespace {
inline int64_t hashCombine(const int64_t Seed, const int64_t Val) {
std::hash<int64_t> Hasher;
return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
}
inline size_t hash_int64_fallback(int64_t key) {
// "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
// http://www.concentric.net/~ttwang/tech/inthash.htm
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ ((unsigned long long)key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ ((unsigned long long)key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ ((unsigned long long)key >> 28);
return static_cast<size_t>(static_cast<uint32_t>(key));
}
inline size_t hash_int64(int64_t k) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
size_t h = 0;
__asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k));
return h;
#else
return hash_int64_fallback(k);
#endif
}
inline size_t hash_int64_pair(int64_t k1, int64_t k2) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
// crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes
// differently from (k2, k1).
k1 += k1;
__asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2));
return k1;
#else
return (hash_int64(k1) << 1) ^ hash_int64(k2);
#endif
}
}
class ArcHash {
public:
int64_t operator()(const Arc &Arc) const {
#ifdef USE_STD_HASH
std::hash<int64_t> Hasher;
return hashCombine(Hasher(Arc.Src), Arc.Dst);
#else
return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst));
#endif
}
};
class TargetNode {
public:
explicit TargetNode(uint32_t Size, uint32_t Samples = 0)
: Size(Size), Samples(Samples)
{}
uint32_t Size;
uint32_t Samples;
// preds and succs contain no duplicate elements and self arcs are not allowed
std::vector<TargetId> Preds;
std::vector<TargetId> Succs;
};
class TargetGraph {
public:
TargetId addTarget(uint32_t Size, uint32_t Samples = 0);
const Arc &incArcWeight(TargetId Src, TargetId Dst, double W = 1.0);
std::vector<TargetNode> Targets;
std::unordered_set<Arc, ArcHash> Arcs;
};
class Cluster {
public:
Cluster(TargetId Id, const TargetNode &F);
Cluster(CallGraph::NodeId Id, const CallGraph::Node &F);
std::string toString() const;
double density() const {
return (double)Samples / Size;
}
std::vector<TargetId> Targets;
void merge(Cluster &&Other, const double Aw = 0);
std::vector<CallGraph::NodeId> Targets;
uint32_t Samples;
uint32_t Size;
bool Frozen; // not a candidate for merging
};
// Maximum size of a cluster, in bytes.
constexpr uint32_t MaxClusterSize = 1 << 20;
// Size of a huge page in bytes.
constexpr uint32_t HugePageSize = 2 << 20;
inline bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
return C1.density() > C2.density();
}
/*
* Cluster functions in order to minimize call distance.
*/
std::vector<Cluster> clusterize(const TargetGraph &Cg);
std::vector<Cluster> clusterize(const CallGraph &Cg);
/*
* Optimize function placement for iTLB cache and i-cache.
*/
std::vector<Cluster> hfsortPlus(const TargetGraph &Cg);
std::vector<Cluster> hfsortPlus(const CallGraph &Cg);
/*
* Pettis-Hansen code layout algorithm
* reference: K. Pettis and R. C. Hansen, "Profile Guided Code Positioning",
* PLDI '90
*/
std::vector<Cluster> pettisAndHansen(const TargetGraph &Cg);
std::vector<Cluster> pettisAndHansen(const CallGraph &Cg);
/* Group functions into clusters randomly. */
std::vector<Cluster> randomClusters(const TargetGraph &Cg);
std::vector<Cluster> randomClusters(const CallGraph &Cg);
}
}
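
A minimal sketch of the Cluster API declared above, given a CallGraph Cg built elsewhere (the ids and sizes are made up):

Cluster A(/*Id=*/3, Cg.Nodes[3]);   // say Size = 100, Samples = 40
Cluster B(/*Id=*/7, Cg.Nodes[7]);   // say Size =  60, Samples = 10
A.merge(std::move(B));
// Now A.Targets == {3, 7}, A.Size == 160, A.Samples == 50, and B has been
// emptied: B.Targets.empty(), B.Size == 0, B.Samples == 0.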

View File

@ -43,6 +43,10 @@
namespace llvm {
namespace bolt {
using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
namespace {
// The size of a cache page
@ -117,7 +121,7 @@ class PrecomputedResults {
// A wrapper for algorithm-wide variables
struct AlgoState {
// the call graph
const TargetGraph *Cg;
const CallGraph *Cg;
// the total number of samples in the graph
double TotalSamples;
// target_id => cluster
@ -126,10 +130,6 @@ struct AlgoState {
std::vector<size_t> Addr;
};
bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
return C1.density() > C2.density();
}
}
/*
@ -199,7 +199,7 @@ double expectedCacheHitRatio(const AlgoState &State,
sortByDensity(Clusters);
// generate function addresses with an alignment
std::vector<size_t> Addr(State.Cg->Targets.size(), InvalidAddr);
std::vector<size_t> Addr(State.Cg->Nodes.size(), InvalidAddr);
size_t CurAddr = 0;
// 'hotness' of the pages
std::vector<double> PageSamples;
@ -207,11 +207,11 @@ double expectedCacheHitRatio(const AlgoState &State,
for (auto TargetId : Cluster->Targets) {
if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16;
Addr[TargetId] = CurAddr;
CurAddr += State.Cg->Targets[TargetId].Size;
CurAddr += State.Cg->Nodes[TargetId].Size;
// update page weight
size_t Page = Addr[TargetId] / PageSize;
while (PageSamples.size() <= Page) PageSamples.push_back(0.0);
PageSamples[Page] += State.Cg->Targets[TargetId].Samples;
PageSamples[Page] += State.Cg->Nodes[TargetId].Samples;
}
}
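
The 16-byte alignment step above, worked through on a made-up address:

// CurAddr = 0x1234: 0x1234 & 0xf == 0x4, so the address is misaligned.
// (0x1234 & ~0xf) + 16 == 0x1230 + 0x10 == 0x1240, the next 16-byte boundary.
// The function's page is then Addr[TargetId] / PageSize.
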
@ -220,12 +220,12 @@ double expectedCacheHitRatio(const AlgoState &State,
for (auto Cluster : Clusters) {
for (auto TargetId : Cluster->Targets) {
size_t Page = Addr[TargetId] / PageSize;
double Samples = State.Cg->Targets[TargetId].Samples;
double Samples = State.Cg->Nodes[TargetId].Samples;
// probability that the page is not present in the cache
double MissProb = missProbability(State, PageSamples[Page]);
for (auto Pred : State.Cg->Targets[TargetId].Preds) {
if (State.Cg->Targets[Pred].Samples == 0) continue;
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
if (State.Cg->Nodes[Pred].Samples == 0) continue;
auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));
// the source page
@ -252,13 +252,13 @@ std::unordered_set<Cluster *> adjacentClusters(const AlgoState &State,
Cluster *C) {
std::unordered_set<Cluster *> Result;
for (auto TargetId : C->Targets) {
for (auto Succ : State.Cg->Targets[TargetId].Succs) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
auto SuccCluster = State.FuncCluster[Succ];
if (SuccCluster != nullptr && SuccCluster != C) {
Result.insert(SuccCluster);
}
}
for (auto Pred : State.Cg->Targets[TargetId].Preds) {
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
auto PredCluster = State.FuncCluster[Pred];
if (PredCluster != nullptr && PredCluster != C) {
Result.insert(PredCluster);
@ -286,7 +286,7 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) {
double shortCalls(const AlgoState &State, Cluster *Cluster) {
double Calls = 0;
for (auto TargetId : Cluster->Targets) {
for (auto Succ : State.Cg->Targets[TargetId].Succs) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
if (State.FuncCluster[Succ] == Cluster) {
auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));
@ -310,7 +310,7 @@ double shortCalls(const AlgoState &State,
Cluster *ClusterSucc) {
double Calls = 0;
for (auto TargetId : ClusterPred->Targets) {
for (auto Succ : State.Cg->Targets[TargetId].Succs) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
if (State.FuncCluster[Succ] == ClusterSucc) {
auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));
@ -323,7 +323,7 @@ double shortCalls(const AlgoState &State,
}
for (auto TargetId : ClusterPred->Targets) {
for (auto Pred : State.Cg->Targets[TargetId].Preds) {
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
if (State.FuncCluster[Pred] == ClusterSucc) {
auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));
@ -389,7 +389,7 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
for (auto TargetId : Into->Targets) {
State.FuncCluster[TargetId] = Into;
State.Addr[TargetId] = CurAddr;
CurAddr += State.Cg->Targets[TargetId].Size;
CurAddr += State.Cg->Nodes[TargetId].Size;
}
Other->Size = 0;
@ -400,29 +400,29 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
/*
* HFSortPlus - layout of hot functions with iTLB cache optimization
*/
std::vector<Cluster> hfsortPlus(const TargetGraph &Cg) {
std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
// create a cluster for every function
std::vector<Cluster> AllClusters;
AllClusters.reserve(Cg.Targets.size());
for (TargetId F = 0; F < Cg.Targets.size(); F++) {
AllClusters.emplace_back(F, Cg.Targets[F]);
AllClusters.reserve(Cg.Nodes.size());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
AllClusters.emplace_back(F, Cg.Nodes[F]);
}
// initialize objects used by the algorithm
std::vector<Cluster *> Clusters;
Clusters.reserve(Cg.Targets.size());
Clusters.reserve(Cg.Nodes.size());
AlgoState State;
State.Cg = &Cg;
State.TotalSamples = 0;
State.FuncCluster = std::vector<Cluster *>(Cg.Targets.size(), nullptr);
State.Addr = std::vector<size_t>(Cg.Targets.size(), InvalidAddr);
for (TargetId F = 0; F < Cg.Targets.size(); F++) {
if (Cg.Targets[F].Samples == 0) continue;
State.FuncCluster = std::vector<Cluster *>(Cg.Nodes.size(), nullptr);
State.Addr = std::vector<size_t>(Cg.Nodes.size(), InvalidAddr);
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.push_back(&AllClusters[F]);
State.FuncCluster[F] = &AllClusters[F];
State.Addr[F] = 0;
State.TotalSamples += Cg.Targets[F].Samples;
State.TotalSamples += Cg.Nodes[F].Samples;
}
DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n"

View File

@ -0,0 +1,206 @@
#include "HFSort.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <set>
#include <unordered_map>
#undef DEBUG_TYPE
#define DEBUG_TYPE "hfsort"
namespace llvm {
namespace bolt {
using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
namespace {
class ClusterArc {
public:
ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0)
: C1(std::min(Ca, Cb))
, C2(std::max(Ca, Cb))
, Weight(W)
{}
friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) {
return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2;
}
Cluster *const C1;
Cluster *const C2;
mutable double Weight;
};
class ClusterArcHash {
public:
int64_t operator()(const ClusterArc &Arc) const {
std::hash<int64_t> Hasher;
return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2));
}
};
using ClusterArcSet = std::unordered_set<ClusterArc, ClusterArcHash>;
void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) {
auto C1head = C1->Targets.front();
auto C1tail = C1->Targets.back();
auto C2head = C2->Targets.front();
auto C2tail = C2->Targets.back();
double C1headC2head = 0;
double C1headC2tail = 0;
double C1tailC2head = 0;
double C1tailC2tail = 0;
for (const auto &Arc : Cg.Arcs) {
if ((Arc.Src == C1head && Arc.Dst == C2head) ||
(Arc.Dst == C1head && Arc.Src == C2head)) {
C1headC2head += Arc.Weight;
} else if ((Arc.Src == C1head && Arc.Dst == C2tail) ||
(Arc.Dst == C1head && Arc.Src == C2tail)) {
C1headC2tail += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2head) ||
(Arc.Dst == C1tail && Arc.Src == C2head)) {
C1tailC2head += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2tail) ||
(Arc.Dst == C1tail && Arc.Src == C2tail)) {
C1tailC2tail += Arc.Weight;
}
}
const double Max = std::max(std::max(C1headC2head, C1headC2tail),
std::max(C1tailC2head, C1tailC2tail));
if (C1headC2head == Max) {
// flip C1
std::reverse(C1->Targets.begin(), C1->Targets.end());
} else if (C1headC2tail == Max) {
// flip C1 C2
std::reverse(C1->Targets.begin(), C1->Targets.end());
std::reverse(C2->Targets.begin(), C2->Targets.end());
} else if (C1tailC2tail == Max) {
// flip C2
std::reverse(C2->Targets.begin(), C2->Targets.end());
}
}
}
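// Pettis-Hansen function placement: start with one cluster per function that
// has samples, then repeatedly merge the pair of clusters connected by the
// heaviest call arc, skipping merges that would exceed MaxClusterSize or
// involve a frozen cluster. Surviving clusters are returned sorted by density.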
std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
// Indexed by NodeId; tracks the cluster each function currently belongs to.
std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
std::vector<Cluster> Clusters;
std::vector<NodeId> Funcs;
Clusters.reserve(Cg.Nodes.size());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
FuncCluster[F] = &Clusters.back();
Funcs.push_back(F);
}
ClusterArcSet Carcs;
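// Record an arc between two clusters, accumulating its weight if the arc
// already exists in the set.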
auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) {
auto Res = Carcs.emplace(C1, C2, Weight);
if (!Res.second) {
Res.first->Weight += Weight;
}
};
// Build the set of cluster arcs from the call-graph arcs.
for (auto &Arc : Cg.Arcs) {
if (Arc.Weight == 0) continue;
auto const S = FuncCluster[Arc.Src];
auto const D = FuncCluster[Arc.Dst];
// ignore arcs where the source or destination was not assigned a cluster
if (S == nullptr || D == nullptr) continue;
// ignore self-edges
if (S == D) continue;
insertOrInc(S, D, Arc.Weight);
}
// Find an arc with max weight and merge its nodes
while (!Carcs.empty()) {
auto Maxpos = std::max_element(
Carcs.begin(),
Carcs.end(),
[&] (const ClusterArc &Carc1, const ClusterArc &Carc2) {
return Carc1.Weight < Carc2.Weight;
}
);
auto Max = *Maxpos;
Carcs.erase(Maxpos);
auto const C1 = Max.C1;
auto const C2 = Max.C2;
if (C1->Size + C2->Size > MaxClusterSize) continue;
if (C1->Frozen || C2->Frozen) continue;
// order functions and merge cluster
orderFuncs(Cg, C1, C2);
DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(),
C1->toString().c_str(), Max.Weight););
// update carcs: merge C1arcs to C2arcs
std::unordered_map<ClusterArc, Cluster *, ClusterArcHash> C2arcs;
for (auto &Carc : Carcs) {
if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2);
if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1);
}
for (auto It : C2arcs) {
auto const C = It.second;
auto const C2arc = It.first;
insertOrInc(C, C1, C2arc.Weight);
Carcs.erase(C2arc);
}
// update FuncCluster
for (auto F : C2->Targets) {
FuncCluster[F] = C1;
}
C1->merge(std::move(*C2), Max.Weight);
}
// Return the clusters that remain live, i.e. those that were never merged
// into another cluster.
std::set<Cluster*> LiveClusters;
std::vector<Cluster> OutClusters;
for (auto Fid : Funcs) {
LiveClusters.insert(FuncCluster[Fid]);
}
for (auto C : LiveClusters) {
OutClusters.push_back(std::move(*C));
}
std::sort(OutClusters.begin(),
OutClusters.end(),
compareClustersDensity);
return OutClusters;
}
}
}

View File

@ -0,0 +1,406 @@
//===--- ReorderFunctions.cpp - Function reordering pass ------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "ReorderFunctions.h"
#include "llvm/Support/Options.h"
#include <fstream>
#define DEBUG_TYPE "hfsort"
using namespace llvm;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> Relocs;
extern cl::opt<uint32_t> RandomSeed;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern size_t padFunction(const bolt::BinaryFunction &Function);
cl::opt<bolt::BinaryFunction::ReorderType>
ReorderFunctions("reorder-functions",
cl::desc("reorder and cluster functions (works only with relocations)"),
cl::init(bolt::BinaryFunction::RT_NONE),
cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE,
"none",
"do not reorder functions"),
clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT,
"exec-count",
"order by execution count"),
clEnumValN(bolt::BinaryFunction::RT_HFSORT,
"hfsort",
"use hfsort algorithm"),
clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS,
"hfsort+",
"use hfsort+ algorithm"),
clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN,
"pettis-hansen",
"use Pettis-Hansen algorithm"),
clEnumValN(bolt::BinaryFunction::RT_RANDOM,
"random",
"reorder functions randomly"),
clEnumValN(bolt::BinaryFunction::RT_USER,
"user",
"use function order specified by -function-order"),
clEnumValEnd),
cl::cat(BoltOptCategory));
static cl::opt<bool>
ReorderFunctionsUseHotSize("reorder-functions-use-hot-size",
cl::desc("use a function's hot size when doing clustering"),
cl::init(true),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<std::string>
FunctionOrderFile("function-order",
cl::desc("file containing an ordered list of functions to use for function "
"reordering"),
cl::cat(BoltOptCategory));
static cl::opt<std::string>
GenerateFunctionOrderFile("generate-function-order",
cl::desc("file to dump the ordered list of functions to use for function "
"reordering"),
cl::cat(BoltOptCategory));
static cl::opt<bool>
UseEdgeCounts("use-edge-counts",
cl::desc("use edge count data when doing clustering"),
cl::init(true),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
namespace bolt {
using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
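// Scale each arc's weight by the callee's total sample count. When weights
// were accumulated from sampled call sites rather than edge counts, also
// convert the summed call offsets into an average offset per arc.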
void ReorderFunctions::normalizeArcWeights() {
// Normalize arc weights.
if (!opts::UseEdgeCounts) {
for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
auto& Func = Cg.Nodes[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
A.AvgCallOffset /= A.Weight;
assert(A.AvgCallOffset < Cg.Nodes[Caller].Size);
}
}
} else {
for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
auto &Func = Cg.Nodes[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
}
}
}
}
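// Assign a layout index to every hot function in cluster order. When
// verbosity or hfsort debug output is enabled, also print per-cluster
// density and call-distance statistics, split at huge-page boundaries.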
void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs) {
std::vector<uint64_t> FuncAddr(Cg.Nodes.size()); // Just for computing stats
uint64_t TotalSize = 0;
uint32_t Index = 0;
// Set order of hot functions based on clusters.
for (const auto& Cluster : Clusters) {
for (const auto FuncId : Cluster.Targets) {
assert(Cg.Nodes[FuncId].Samples > 0);
Cg.Funcs[FuncId]->setIndex(Index++);
FuncAddr[FuncId] = TotalSize;
TotalSize += Cg.Nodes[FuncId].Size;
}
}
if (opts::ReorderFunctions == BinaryFunction::RT_NONE)
return;
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType("hfsort"))
return;
#else
return;
#endif
}
TotalSize = 0;
uint64_t CurPage = 0;
uint64_t Hotfuncs = 0;
double TotalDistance = 0;
double TotalCalls = 0;
double TotalCalls64B = 0;
double TotalCalls4KB = 0;
double TotalCalls2MB = 0;
dbgs() << "============== page 0 ==============\n";
for (auto& Cluster : Clusters) {
dbgs() <<
format("-------- density = %.3lf (%u / %u) --------\n",
(double) Cluster.Samples / Cluster.Size,
Cluster.Samples, Cluster.Size);
for (auto FuncId : Cluster.Targets) {
if (Cg.Nodes[FuncId].Samples > 0) {
Hotfuncs++;
dbgs() << "BOLT-INFO: hot func " << *Cg.Funcs[FuncId]
<< " (" << Cg.Nodes[FuncId].Size << ")\n";
uint64_t Dist = 0;
uint64_t Calls = 0;
for (auto Dst : Cg.Nodes[FuncId].Succs) {
auto& A = *Cg.Arcs.find(Arc(FuncId, Dst));
auto D =
std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset));
auto W = A.Weight;
Calls += W;
if (D < 64) TotalCalls64B += W;
if (D < 4096) TotalCalls4KB += W;
if (D < (2 << 20)) TotalCalls2MB += W;
Dist += A.Weight * D;
dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
"weight = %.0lf, callDist = %f\n",
A.Src, FuncAddr[A.Src], A.AvgCallOffset,
A.Dst, FuncAddr[A.Dst], A.Weight, D);
}
TotalCalls += Calls;
TotalDistance += Dist;
dbgs() << format("start = %6u : avgCallDist = %lu : %s\n",
TotalSize,
Calls ? Dist / Calls : 0,
Cg.Funcs[FuncId]->getPrintName().c_str());
TotalSize += Cg.Nodes[FuncId].Size;
auto NewPage = TotalSize / HugePageSize;
if (NewPage != CurPage) {
CurPage = NewPage;
dbgs() << format("============== page %u ==============\n", CurPage);
}
}
}
}
dbgs() << format(" Number of hot functions: %u\n"
" Number of clusters: %lu\n",
Hotfuncs, Clusters.size())
<< format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n",
TotalCalls ? TotalDistance / TotalCalls : 0,
TotalDistance, TotalCalls)
<< format(" Total Calls = %.0lf\n", TotalCalls);
if (TotalCalls) {
dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n",
TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
<< format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n",
TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
<< format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n",
TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
}
}
namespace {
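// Read the ordered list of function names, one per line, from the file
// given by -function-order.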
std::vector<std::string> readFunctionOrderFile() {
std::vector<std::string> FunctionNames;
std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in);
if (!FuncsFile) {
errs() << "Ordered functions file \"" << opts::FunctionOrderFile
<< "\" can't be opened.\n";
exit(1);
}
std::string FuncName;
while (std::getline(FuncsFile, FuncName)) {
FunctionNames.push_back(FuncName);
}
return FunctionNames;
}
}
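// Entry point for the pass: reject non-relocation mode, build the call graph
// when a clustering algorithm was requested, dispatch on -reorder-functions,
// apply the resulting cluster order, and optionally dump the final function
// order to the file given by -generate-function-order.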
void ReorderFunctions::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) {
errs() << "BOLT-ERROR: Function reordering only works when "
<< "relocs are enabled.\n";
exit(1);
}
if (opts::ReorderFunctions != BinaryFunction::RT_NONE &&
opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT &&
opts::ReorderFunctions != BinaryFunction::RT_USER) {
Cg = buildCallGraph(BC,
BFs,
[this](const BinaryFunction &BF) {
return !shouldOptimize(BF) || !BF.hasProfile();
},
false, // IncludeColdCalls
opts::ReorderFunctionsUseHotSize,
opts::UseEdgeCounts);
normalizeArcWeights();
}
std::vector<Cluster> Clusters;
switch(opts::ReorderFunctions) {
case BinaryFunction::RT_NONE:
break;
case BinaryFunction::RT_EXEC_COUNT:
{
std::vector<BinaryFunction *> SortedFunctions(BFs.size());
uint32_t Index = 0;
std::transform(BFs.begin(),
BFs.end(),
SortedFunctions.begin(),
[](std::pair<const uint64_t, BinaryFunction> &BFI) {
return &BFI.second;
});
std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
[&](const BinaryFunction *A, const BinaryFunction *B) {
if (!opts::shouldProcess(*A))
return false;
const auto PadA = opts::padFunction(*A);
const auto PadB = opts::padFunction(*B);
if (!PadA || !PadB) {
if (PadA)
return true;
if (PadB)
return false;
}
return !A->hasProfile() &&
(B->hasProfile() ||
(A->getExecutionCount() > B->getExecutionCount()));
});
for (auto *BF : SortedFunctions) {
if (BF->hasProfile())
BF->setIndex(Index++);
}
}
break;
case BinaryFunction::RT_HFSORT:
Clusters = clusterize(Cg);
break;
case BinaryFunction::RT_HFSORT_PLUS:
Clusters = hfsortPlus(Cg);
break;
case BinaryFunction::RT_PETTIS_HANSEN:
Clusters = pettisAndHansen(Cg);
break;
case BinaryFunction::RT_RANDOM:
std::srand(opts::RandomSeed);
Clusters = randomClusters(Cg);
break;
case BinaryFunction::RT_USER:
{
uint32_t Index = 0;
for (const auto &Function : readFunctionOrderFile()) {
std::vector<uint64_t> FuncAddrs;
auto Itr = BC.GlobalSymbols.find(Function);
if (Itr == BC.GlobalSymbols.end()) {
uint32_t LocalID = 1;
while(1) {
// If we can't find the main symbol name, look for alternates.
Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID));
if (Itr != BC.GlobalSymbols.end())
FuncAddrs.push_back(Itr->second);
else
break;
LocalID++;
}
} else {
FuncAddrs.push_back(Itr->second);
}
if (FuncAddrs.empty()) {
errs() << "BOLT-WARNING: Reorder functions: can't find function for "
<< Function << ".\n";
continue;
}
for (const auto FuncAddr : FuncAddrs) {
const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat");
assert(FuncSym);
auto *BF = BC.getFunctionForSymbol(FuncSym);
if (!BF) {
errs() << "BOLT-WARNING: Reorder functions: can't find function for "
<< Function << ".\n";
break;
}
if (!BF->hasValidIndex()) {
BF->setIndex(Index++);
} else if (opts::Verbosity > 0) {
errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n";
}
}
}
}
break;
}
reorder(std::move(Clusters), BFs);
if (!opts::GenerateFunctionOrderFile.empty()) {
std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out);
if (!FuncsFile) {
errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile
<< "\" can't be opened.\n";
exit(1);
}
std::vector<BinaryFunction *> SortedFunctions(BFs.size());
std::transform(BFs.begin(),
BFs.end(),
SortedFunctions.begin(),
[](std::pair<const uint64_t, BinaryFunction> &BFI) {
return &BFI.second;
});
// Sort functions by index.
std::stable_sort(
SortedFunctions.begin(),
SortedFunctions.end(),
[](const BinaryFunction *A, const BinaryFunction *B) {
if (A->hasValidIndex() && B->hasValidIndex()) {
return A->getIndex() < B->getIndex();
} else if (A->hasValidIndex() && !B->hasValidIndex()) {
return true;
} else if (!A->hasValidIndex() && B->hasValidIndex()) {
return false;
} else {
return A->getAddress() < B->getAddress();
}
});
for (const auto *Func : SortedFunctions) {
if (!Func->hasValidIndex())
break;
FuncsFile << Func->getSymbol()->getName().data() << "\n";
}
FuncsFile.close();
outs() << "BOLT-INFO: dumped function order to \""
<< opts::GenerateFunctionOrderFile << "\"\n";
exit(0);
}
}
} // namespace bolt
} // namespace llvm

View File

@ -0,0 +1,43 @@
//===--- ReorderFunctions.h - Function reordering pass --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FUNCTIONS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FUNCTIONS_H
#include "BinaryPasses.h"
#include "HFSort.h"
namespace llvm {
namespace bolt {
/// Modify function order for streaming based on hotness.
class ReorderFunctions : public BinaryFunctionPass {
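/// Call graph over the binary's functions, built by runOnFunctions when a
/// clustering algorithm is selected.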
CallGraph Cg;
void normalizeArcWeights();
void reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs);
public:
explicit ReorderFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "reorder-functions";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm
#endif