[BOLT] More CG refactoring

Summary:
Do some additional refactoring of the CallGraph class.  Add a BinaryFunctionCallGraph class that holds the BOLT-specific bits.  This is in preparation for moving the generic CallGraph class into a library that both BOLT and HHVM can use.

Make data members of CallGraph private and add the appropriate accessor methods.

(cherry picked from FBD5143468)
Bill Nell 2017-05-26 15:46:46 -07:00 committed by Maksim Panchenko
parent 95ab659fe4
commit 5feee9f1d8
16 changed files with 653 additions and 436 deletions
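
For orientation before reading the diffs, here is a minimal, self-contained analogue of the layering this commit introduces (illustrative only; apart from the CallGraph name, none of these classes or identifiers are BOLT code): the generic graph keeps its node data private behind accessors, and a thin subclass adds the mapping from a client-specific handle to node ids, much as BinaryFunctionCallGraph does for BinaryFunction pointers.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Generic side: node data is private; clients go through accessors such as
// size(Id) and samples(Id) instead of poking at a public Nodes vector.
class CallGraph {
public:
  using NodeId = size_t;

  NodeId addNode(uint32_t Size, uint64_t Samples = 0) {
    Nodes.push_back({Size, Samples, {}});
    return Nodes.size() - 1;
  }
  void addArc(NodeId Src, NodeId Dst) { Nodes[Src].Succs.push_back(Dst); }

  size_t numNodes() const { return Nodes.size(); }
  uint32_t size(NodeId Id) const { return Nodes[Id].Size; }
  uint64_t samples(NodeId Id) const { return Nodes[Id].Samples; }
  const std::vector<NodeId> &successors(NodeId Id) const {
    return Nodes[Id].Succs;
  }

private:
  struct Node { uint32_t Size; uint64_t Samples; std::vector<NodeId> Succs; };
  std::vector<Node> Nodes;
};

// Client-specific side: the subclass owns the handle <-> node-id bookkeeping,
// mirroring what BinaryFunctionCallGraph does for BinaryFunction pointers.
class NamedCallGraph : public CallGraph {
public:
  NodeId addNode(const std::string &Name, uint32_t Size, uint64_t Samples) {
    const NodeId Id = CallGraph::addNode(Size, Samples);
    NameToId[Name] = Id;
    Names.push_back(Name);
    return Id;
  }
  NodeId getNodeId(const std::string &Name) const { return NameToId.at(Name); }
  const std::string &nodeIdToName(NodeId Id) const { return Names[Id]; }

private:
  std::unordered_map<std::string, NodeId> NameToId;
  std::vector<std::string> Names;
};

int main() {
  NamedCallGraph Cg;
  const auto Main = Cg.addNode("main", 128, 1000);
  const auto Foo = Cg.addNode("foo", 64, 500);
  Cg.addArc(Main, Foo);
  for (CallGraph::NodeId Id = 0; Id < Cg.numNodes(); ++Id)
    std::printf("%s: size=%u samples=%llu succs=%zu\n",
                Cg.nodeIdToName(Id).c_str(), Cg.size(Id),
                (unsigned long long)Cg.samples(Id), Cg.successors(Id).size());
  return 0;
}
```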

View File

@ -0,0 +1,195 @@
//===--- Passes/BinaryFunctionCallGraph.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "BinaryFunctionCallGraph.h"
#include "BinaryFunction.h"
#include "BinaryContext.h"
#define DEBUG_TYPE "callgraph"
namespace llvm {
namespace bolt {
CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF,
uint32_t Size,
uint64_t Samples) {
auto Id = CallGraph::addNode(Size, Samples);
assert(size_t(Id) == Funcs.size());
Funcs.push_back(BF);
FuncToNodeId[BF] = Id;
assert(Funcs[Id] == BF);
return Id;
}
std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
std::deque<BinaryFunction *> TopologicalOrder;
enum NodeStatus { NEW, VISITING, VISITED };
std::vector<NodeStatus> NodeStatus(Funcs.size());
std::stack<NodeId> Worklist;
for (auto *Func : Funcs) {
const auto Id = FuncToNodeId.at(Func);
Worklist.push(Id);
NodeStatus[Id] = NEW;
}
while (!Worklist.empty()) {
const auto FuncId = Worklist.top();
Worklist.pop();
if (NodeStatus[FuncId] == VISITED)
continue;
if (NodeStatus[FuncId] == VISITING) {
TopologicalOrder.push_back(Funcs[FuncId]);
NodeStatus[FuncId] = VISITED;
continue;
}
assert(NodeStatus[FuncId] == NEW);
NodeStatus[FuncId] = VISITING;
Worklist.push(FuncId);
for (const auto Callee : successors(FuncId)) {
if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
continue;
Worklist.push(Callee);
}
}
return TopologicalOrder;
}
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
CgFilterFunction Filter,
bool IncludeColdCalls,
bool UseFunctionHotSize,
bool UseEdgeCounts) {
BinaryFunctionCallGraph Cg;
// Add call graph nodes.
auto lookupNode = [&](BinaryFunction *Function) {
const auto Id = Cg.maybeGetNodeId(Function);
if (Id == CallGraph::InvalidId) {
// It's ok to use the hot size here when the function is split. This is
// because emitFunctions will emit the hot part first in the order that is
// computed by ReorderFunctions. The cold part will be emitted with the
// rest of the cold functions and code.
const auto Size = UseFunctionHotSize && Function->isSplit()
? Function->estimateHotSize()
: Function->estimateSize();
// NOTE: for functions without a profile, we set the number of samples
// to zero. This will keep these functions from appearing in the hot
// section. This is a little weird because we wouldn't be trying to
// create a node for a function unless it was the target of a call from
// a hot block. The alternative would be to set the count to one or
// accumulate the number of calls from the callsite into the function
// samples. Results from performance testing seem to favor the zero
// count though, so I'm leaving it this way for now.
const auto Samples =
Function->hasProfile() ? Function->getExecutionCount() : 0;
return Cg.addNode(Function, Size, Samples);
} else {
return Id;
}
};
// Add call graph edges.
uint64_t NotProcessed = 0;
uint64_t TotalCalls = 0;
for (auto &It : BFs) {
auto *Function = &It.second;
if (Filter(*Function)) {
continue;
}
auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
const auto SrcId = lookupNode(Function);
uint64_t Offset = Function->getAddress();
auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
const auto DstId = lookupNode(DstFunc);
const auto AvgDelta = !UseEdgeCounts ? Offset - DstFunc->getAddress() : 0;
Cg.incArcWeight(SrcId, DstId, Count, AvgDelta);
DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function
<< " -> " << *DstFunc << " @ " << Offset << "\n");
return true;
}
return false;
};
for (auto *BB : Function->layout()) {
// Don't count calls from cold blocks
if (BB->isCold() && !IncludeColdCalls)
continue;
for (auto &Inst : *BB) {
// Find call instructions and extract target symbols from each one.
if (!BC.MIA->isCall(Inst))
continue;
++TotalCalls;
if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
// For direct calls, just use the BB execution count.
const auto Count = UseEdgeCounts && BB->hasProfile()
? BB->getExecutionCount() : 1;
if (!recordCall(DstSym, Count))
++NotProcessed;
} else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
// For indirect calls and jump tables, use branch data.
if (!BranchDataOrErr) {
++NotProcessed;
continue;
}
const FuncBranchData &BranchData = BranchDataOrErr.get();
const auto DataOffset =
BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");
for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
// Count each target as a separate call.
++TotalCalls;
if (!BI.To.IsSymbol) {
++NotProcessed;
continue;
}
auto Itr = BC.GlobalSymbols.find(BI.To.Name);
if (Itr == BC.GlobalSymbols.end()) {
++NotProcessed;
continue;
}
const auto *DstSym =
BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");
if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1))
++NotProcessed;
}
}
if (!UseEdgeCounts) {
Offset += BC.computeCodeSize(&Inst, &Inst + 1);
}
}
}
}
outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed
<< " callsites not processed out of " << TotalCalls << "\n";
return Cg;
}
}
}
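
A self-contained sketch (not BOLT code) of the worklist pattern used by buildTraversalOrder above: an iterative DFS that emits a node only after all of its successors, producing the bottom-up order later consumed by passes such as FrameAnalysis.

```cpp
#include <cstdio>
#include <deque>
#include <stack>
#include <vector>

std::deque<size_t> postOrder(const std::vector<std::vector<size_t>> &Succs) {
  enum Status { NEW, VISITING, VISITED };
  std::vector<Status> State(Succs.size(), NEW);
  std::deque<size_t> Order;
  std::stack<size_t> Worklist;
  for (size_t Id = 0; Id < Succs.size(); ++Id)
    Worklist.push(Id);
  while (!Worklist.empty()) {
    const size_t Id = Worklist.top();
    Worklist.pop();
    if (State[Id] == VISITED)
      continue;
    if (State[Id] == VISITING) {   // All successors done: emit the node.
      Order.push_back(Id);
      State[Id] = VISITED;
      continue;
    }
    State[Id] = VISITING;
    Worklist.push(Id);             // Revisit after the successors.
    for (size_t Callee : Succs[Id])
      if (State[Callee] == NEW)
        Worklist.push(Callee);
  }
  return Order;
}

int main() {
  // 0 calls 1 and 2; 1 calls 2.  Expected bottom-up order: 2 1 0.
  std::vector<std::vector<size_t>> Succs = {{1, 2}, {2}, {}};
  for (size_t Id : postOrder(Succs))
    std::printf("%zu ", Id);
  std::printf("\n");
  return 0;
}
```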

View File

@ -0,0 +1,80 @@
//===--- Passes/BinaryFunctionCallGraph.h ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H
#include "CallGraph.h"
#include <unordered_map>
#include <functional>
#include <deque>
#include <map>
namespace llvm {
namespace bolt {
class BinaryFunction;
class BinaryContext;
class BinaryFunctionCallGraph : public CallGraph {
public:
NodeId maybeGetNodeId(const BinaryFunction *BF) const {
auto Itr = FuncToNodeId.find(BF);
return Itr != FuncToNodeId.end() ? Itr->second : InvalidId;
}
NodeId getNodeId(const BinaryFunction *BF) const {
auto Itr = FuncToNodeId.find(BF);
assert(Itr != FuncToNodeId.end());
return Itr->second;
}
BinaryFunction *nodeIdToFunc(NodeId Id) {
assert(Id < Funcs.size());
return Funcs[Id];
}
const BinaryFunction *nodeIdToFunc(NodeId Id) const {
assert(Id < Funcs.size());
return Funcs[Id];
}
NodeId addNode(BinaryFunction *BF, uint32_t Size, uint64_t Samples = 0);
/// Compute a DFS traversal of the call graph.
std::deque<BinaryFunction *> buildTraversalOrder();
private:
std::unordered_map<const BinaryFunction *, NodeId> FuncToNodeId;
std::vector<BinaryFunction *> Funcs;
};
using CgFilterFunction = std::function<bool (const BinaryFunction &BF)>;
inline bool NoFilter(const BinaryFunction &) { return false; }
/// Builds a call graph from the map of BinaryFunctions provided in BFs.
/// The arguments control how the graph is constructed.
/// Filter is called on each function; any function for which it returns true
/// is omitted from the graph.
/// If IncludeColdCalls is true, calls from cold BBs are included in the
/// graph; otherwise they are ignored.
/// UseFunctionHotSize controls whether the hot size of a function is used when
/// filling in the Size attribute of new Nodes.
/// UseEdgeCounts controls whether the AvgCallOffset attribute on Arcs is
/// computed from the offsets of call instructions.
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
CgFilterFunction Filter = NoFilter,
bool IncludeColdCalls = true,
bool UseFunctionHotSize = false,
bool UseEdgeCounts = false);
}
}
#endif
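
As a usage note, the declaration above is typically consumed along the lines of the following hedged sketch. It assumes BOLT's headers and mirrors the option choices visible in the ReorderFunctions and FrameAnalysis diffs further down; the wrapper function itself is illustrative and not part of this commit.

```cpp
#include "BinaryFunctionCallGraph.h"
#include "BinaryFunction.h"

using namespace llvm;
using namespace bolt;

void runOnCallGraph(BinaryContext &BC,
                    std::map<uint64_t, BinaryFunction> &BFs) {
  // Skip functions without a profile; count only hot-path calls; size nodes
  // by hot size; keep per-offset (not per-edge-count) arc weights.
  BinaryFunctionCallGraph Cg = buildCallGraph(
      BC, BFs,
      [](const BinaryFunction &BF) { return !BF.hasProfile(); },
      /*IncludeColdCalls=*/false,
      /*UseFunctionHotSize=*/true,
      /*UseEdgeCounts=*/false);

  // Bottom-up traversal, as FrameAnalysis does for its clobber analysis.
  for (BinaryFunction *BF : Cg.buildTraversalOrder()) {
    const auto Id = Cg.getNodeId(BF);
    for (auto CallerId : Cg.predecessors(Id)) {
      BinaryFunction *Caller = Cg.nodeIdToFunc(CallerId);
      (void)Caller; // ... pass-specific work ...
    }
  }
}
```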

View File

@ -1,5 +1,6 @@
add_llvm_library(LLVMBOLTPasses
BinaryPasses.cpp
BinaryFunctionCallGraph.cpp
CallGraph.cpp
DataflowAnalysis.cpp
DataflowInfoManager.cpp

View File

@ -67,195 +67,52 @@ namespace bolt {
int64_t CallGraph::Arc::Hash::operator()(const Arc &Arc) const {
#ifdef USE_STD_HASH
std::hash<int64_t> Hasher;
return hashCombine(Hasher(Arc.Src), Arc.Dst);
return hashCombine(Hasher(Arc.src()), Arc.dst());
#else
return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst));
return hash_int64_pair(int64_t(Arc.src()), int64_t(Arc.dst()));
#endif
}
CallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::function<bool (const BinaryFunction &BF)> Filter,
bool IncludeColdCalls,
bool UseFunctionHotSize,
bool UseEdgeCounts) {
CallGraph Cg;
// Add call graph nodes.
auto lookupNode = [&](BinaryFunction *Function) {
auto It = Cg.FuncToNodeId.find(Function);
if (It == Cg.FuncToNodeId.end()) {
// It's ok to use the hot size here when the function is split. This is
// because emitFunctions will emit the hot part first in the order that is
// computed by ReorderFunctions. The cold part will be emitted with the
// rest of the cold functions and code.
const auto Size = UseFunctionHotSize && Function->isSplit()
? Function->estimateHotSize()
: Function->estimateSize();
const auto Id = Cg.addNode(Size);
assert(size_t(Id) == Cg.Funcs.size());
Cg.Funcs.push_back(Function);
Cg.FuncToNodeId[Function] = Id;
// NOTE: for functions without a profile, we set the number of samples
// to zero. This will keep these functions from appearing in the hot
// section. This is a little weird because we wouldn't be trying to
// create a node for a function unless it was the target of a call from
// a hot block. The alternative would be to set the count to one or
// accumulate the number of calls from the callsite into the function
// samples. Results from performance testing seem to favor the zero
// count though, so I'm leaving it this way for now.
Cg.Nodes[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0;
assert(Cg.Funcs[Id] == Function);
return Id;
} else {
return It->second;
}
};
// Add call graph edges.
uint64_t NotProcessed = 0;
uint64_t TotalCalls = 0;
for (auto &It : BFs) {
auto *Function = &It.second;
if(Filter(*Function)) {
continue;
}
auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
const auto SrcId = lookupNode(Function);
uint64_t Offset = Function->getAddress();
auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
const auto DstId = lookupNode(DstFunc);
auto &A = Cg.incArcWeight(SrcId, DstId, Count);
if (!UseEdgeCounts) {
A.AvgCallOffset += (Offset - DstFunc->getAddress());
}
DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function
<< " -> " << *DstFunc << " @ " << Offset << "\n");
return true;
}
return false;
};
for (auto *BB : Function->layout()) {
// Don't count calls from cold blocks
if (BB->isCold() && !IncludeColdCalls)
continue;
for (auto &Inst : *BB) {
// Find call instructions and extract target symbols from each one.
if (!BC.MIA->isCall(Inst))
continue;
++TotalCalls;
if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
// For direct calls, just use the BB execution count.
const auto Count = UseEdgeCounts && BB->hasProfile()
? BB->getExecutionCount() : 1;
if (!recordCall(DstSym, Count))
++NotProcessed;
} else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
// For indirect calls and jump tables, use branch data.
if(!BranchDataOrErr) {
++NotProcessed;
continue;
}
const FuncBranchData &BranchData = BranchDataOrErr.get();
const auto DataOffset =
BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");
for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
// Count each target as a separate call.
++TotalCalls;
if (!BI.To.IsSymbol) {
++NotProcessed;
continue;
}
auto Itr = BC.GlobalSymbols.find(BI.To.Name);
if (Itr == BC.GlobalSymbols.end()) {
++NotProcessed;
continue;
}
const auto *DstSym =
BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");
if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1))
++NotProcessed;
}
}
if (!UseEdgeCounts) {
Offset += BC.computeCodeSize(&Inst, &Inst + 1);
}
}
}
}
outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed
<< " callsites not processed out of " << TotalCalls << "\n";
return Cg;
}
CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint32_t Samples) {
CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint64_t Samples) {
auto Id = Nodes.size();
Nodes.emplace_back(Size, Samples);
return Id;
}
const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W) {
const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W,
double Offset) {
auto Res = Arcs.emplace(Src, Dst, W);
if (!Res.second) {
Res.first->Weight += W;
return *Res.first;
}
Res.first->AvgCallOffset += Offset;
Nodes[Src].Succs.push_back(Dst);
Nodes[Dst].Preds.push_back(Src);
return *Res.first;
}
std::deque<BinaryFunction *> CallGraph::buildTraversalOrder() {
std::deque<BinaryFunction *> TopologicalOrder;
enum NodeStatus { NEW, VISITING, VISITED };
std::vector<NodeStatus> NodeStatus(Funcs.size());
std::stack<NodeId> Worklist;
for (auto *Func : Funcs) {
const auto Id = FuncToNodeId.at(Func);
Worklist.push(Id);
NodeStatus[Id] = NEW;
}
while (!Worklist.empty()) {
const auto FuncId = Worklist.top();
Worklist.pop();
if (NodeStatus[FuncId] == VISITED)
continue;
if (NodeStatus[FuncId] == VISITING) {
TopologicalOrder.push_back(Funcs[FuncId]);
NodeStatus[FuncId] = VISITED;
continue;
}
assert(NodeStatus[FuncId] == NEW);
NodeStatus[FuncId] = VISITING;
Worklist.push(FuncId);
for (const auto Callee : Nodes[FuncId].Succs) {
if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
continue;
Worklist.push(Callee);
}
}
return TopologicalOrder;
}
void CallGraph::normalizeArcWeights(bool UseEdgeCounts) {
// Normalize arc weights.
if (!UseEdgeCounts) {
for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) {
auto& Func = getNode(FuncId);
for (auto Caller : Func.predecessors()) {
auto Arc = findArc(Caller, FuncId);
Arc->NormalizedWeight = Arc->weight() / Func.samples();
Arc->AvgCallOffset /= Arc->weight();
assert(Arc->AvgCallOffset < size(Caller));
}
}
} else {
for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) {
auto &Func = getNode(FuncId);
for (auto Caller : Func.predecessors()) {
auto Arc = findArc(Caller, FuncId);
Arc->NormalizedWeight = Arc->weight() / Func.samples();
}
}
}
}
}
}
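
A tiny worked example of the two divisions performed by normalizeArcWeights above (the numbers are illustrative, and this does not model how the offset sum is accumulated by incArcWeight):

```cpp
#include <cassert>

int main() {
  // Suppose one caller->callee arc ended up with weight 3 and an accumulated
  // call-offset total of 96, and the callee has 12 profile samples.
  const double Weight = 3.0;
  const double OffsetSum = 96.0;
  const double CalleeSamples = 12.0;

  // normalizeArcWeights: NormalizedWeight = weight / callee samples, and
  // (when offsets are tracked) AvgCallOffset = offset total / weight.
  const double NormalizedWeight = Weight / CalleeSamples; // 0.25
  const double AvgCallOffset = OffsetSum / Weight;        // 32.0

  assert(NormalizedWeight == 0.25);
  assert(AvgCallOffset == 32.0);
  return 0;
}
```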

View File

@ -12,20 +12,14 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H
#include <cassert>
#include <string>
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include <functional>
#include <map>
#include <deque>
namespace llvm {
namespace bolt {
class BinaryFunction;
class BinaryContext;
// TODO: find better place for this
inline int64_t hashCombine(const int64_t Seed, const int64_t Val) {
std::hash<int64_t> Hasher;
@ -55,6 +49,14 @@ public:
return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst;
}
NodeId src() const { return Src; }
NodeId dst() const { return Dst; }
double weight() const { return Weight; }
double avgCallOffset() const { return AvgCallOffset; }
double normalizedWeight() const { return NormalizedWeight; }
private:
friend class CallGraph;
const NodeId Src;
const NodeId Dst;
mutable double Weight;
@ -62,50 +64,115 @@ public:
mutable double AvgCallOffset{0};
};
using ArcsType = std::unordered_set<Arc, Arc::Hash>;
using ArcIterator = ArcsType::iterator;
using ArcConstIterator = ArcsType::const_iterator;
class Node {
public:
explicit Node(uint32_t Size, uint32_t Samples = 0)
explicit Node(uint32_t Size, uint64_t Samples = 0)
: Size(Size), Samples(Samples)
{}
uint32_t size() const { return Size; }
uint64_t samples() const { return Samples; }
const std::vector<NodeId> &successors() const {
return Succs;
}
const std::vector<NodeId> &predecessors() const {
return Preds;
}
private:
friend class CallGraph;
uint32_t Size;
uint32_t Samples;
uint64_t Samples;
// preds and succs contain no duplicate elements and self arcs are not allowed
std::vector<NodeId> Preds;
std::vector<NodeId> Succs;
};
NodeId addNode(uint32_t Size, uint32_t Samples = 0);
const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0);
size_t numNodes() const {
return Nodes.size();
}
const Node &getNode(const NodeId Id) const {
assert(Id < Nodes.size());
return Nodes[Id];
}
uint32_t size(const NodeId Id) const {
assert(Id < Nodes.size());
return Nodes[Id].Size;
}
uint64_t samples(const NodeId Id) const {
assert(Id < Nodes.size());
return Nodes[Id].Samples;
}
const std::vector<NodeId> &successors(const NodeId Id) const {
assert(Id < Nodes.size());
return Nodes[Id].Succs;
}
const std::vector<NodeId> &predecessors(const NodeId Id) const {
assert(Id < Nodes.size());
return Nodes[Id].Preds;
}
NodeId addNode(uint32_t Size, uint64_t Samples = 0);
const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0,
double Offset = 0.0);
ArcIterator findArc(NodeId Src, NodeId Dst) {
return Arcs.find(Arc(Src, Dst));
}
ArcConstIterator findArc(NodeId Src, NodeId Dst) const {
return Arcs.find(Arc(Src, Dst));
}
const ArcsType &getArcs() const {
return Arcs;
}
/// Compute a DFS traversal of the call graph.
std::deque<BinaryFunction *> buildTraversalOrder();
void normalizeArcWeights(bool UseEdgeCounts);
template <typename L>
void printDot(char* FileName, L GetLabel) const;
private:
std::vector<Node> Nodes;
std::unordered_set<Arc, Arc::Hash> Arcs;
std::vector<BinaryFunction *> Funcs;
std::unordered_map<const BinaryFunction *, NodeId> FuncToNodeId;
ArcsType Arcs;
};
inline bool NoFilter(const BinaryFunction &) { return false; }
/// Builds a call graph from the map of BinaryFunctions provided in BFs.
/// The arguments control how the graph is constructed.
/// Filter is called on each function, any function that it returns true for
/// is omitted from the graph.
/// If IncludeColdCalls is true, then calls from cold BBs are considered for the
/// graph, otherwise they are ignored.
/// UseFunctionHotSize controls whether the hot size of a function is used when
/// filling in the Size attribute of new Nodes.
/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is
/// computed using the offsets of call instructions.
CallGraph buildCallGraph(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::function<bool (const BinaryFunction &BF)> Filter = NoFilter,
bool IncludeColdCalls = true,
bool UseFunctionHotSize = false,
bool UseEdgeCounts = false);
template<class L>
void CallGraph::printDot(char* FileName, L GetLabel) const {
FILE* File = fopen(FileName, "wt");
if (!File) return;
fprintf(File, "digraph g {\n");
for (NodeId F = 0; F < Nodes.size(); F++) {
if (Nodes[F].samples() == 0) continue;
fprintf(
File,
"f%lu [label=\"%s\\nsamples=%u\\nsize=%u\"];\n",
F,
GetLabel(F),
Nodes[F].samples(),
Nodes[F].size());
}
for (NodeId F = 0; F < Nodes.size(); F++) {
if (Nodes[F].samples() == 0) continue;
for (auto Dst : Nodes[F].successors()) {
auto Arc = findArc(F, Dst);
fprintf(
File,
"f%lu -> f%u [label=\"normWgt=%.3lf,weight=%.0lf,callOffset=%.1lf\"];"
"\n",
F,
Dst,
Arc->normalizedWeight(),
Arc->weight(),
Arc->avgCallOffset());
}
}
fprintf(File, "}\n");
fclose(File);
}
} // namespace bolt
} // namespace llvm
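
A hedged sketch of driving the printDot template added above from BOLT code; the wrapper function, file name, and label caching are illustrative assumptions, not part of this commit. The label strings are cached so the lambda never returns a pointer into a temporary.

```cpp
#include "BinaryFunctionCallGraph.h"
#include "BinaryFunction.h"
#include <string>
#include <vector>

using namespace llvm;
using namespace bolt;

void dumpCallGraph(BinaryFunctionCallGraph &Cg) {
  // Cache printable labels for every node up front.
  std::vector<std::string> Labels(Cg.numNodes());
  for (CallGraph::NodeId Id = 0; Id < Cg.numNodes(); ++Id)
    Labels[Id] = Cg.nodeIdToFunc(Id)->getPrintName();

  char FileName[] = "callgraph.dot";
  Cg.printDot(FileName,
              [&](CallGraph::NodeId Id) { return Labels[Id].c_str(); });
}
```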

View File

@ -347,8 +347,8 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) {
}
if (RegsKilledMap[Func] != RegsKilled || Updated) {
for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
Queue.push(Cg.Funcs[Caller]);
for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) {
Queue.push(Cg.nodeIdToFunc(Caller));
}
}
RegsKilledMap[Func] = std::move(RegsKilled);

View File

@ -13,7 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H
#include "BinaryPasses.h"
#include "CallGraph.h"
#include "BinaryFunctionCallGraph.h"
#include "StackPointerTracking.h"
namespace llvm {
@ -113,7 +113,7 @@ raw_ostream &operator<<(raw_ostream &OS,
///
class FrameAnalysis : public BinaryFunctionPass {
/// Call graph info
CallGraph Cg;
BinaryFunctionCallGraph Cg;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up

View File

@ -96,8 +96,8 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) {
}
if (RegsKilledMap[Func] != RegsKilled) {
for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
Queue.push(Cg.Funcs[Caller]);
for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) {
Queue.push(Cg.nodeIdToFunc(Caller));
}
}
RegsKilledMap[Func] = std::move(RegsKilled);

View File

@ -13,7 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
#include "BinaryPasses.h"
#include "CallGraph.h"
#include "BinaryFunctionCallGraph.h"
namespace llvm {
namespace bolt {
@ -76,7 +76,7 @@ class FrameOptimizerPass : public BinaryFunctionPass {
uint64_t CountFunctionsAllClobber{0};
/// Call graph info
CallGraph Cg;
BinaryFunctionCallGraph Cg;
/// DFS or reverse post-ordering of the call graph nodes to allow us to
/// traverse the call graph bottom-up

View File

@ -30,13 +30,17 @@
#include "HFSort.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/raw_ostream.h"
#include <set>
#include <unordered_map>
#include <unordered_set>
#undef DEBUG_TYPE
#define DEBUG_TYPE "hfsort"
namespace opts {
extern llvm::cl::opt<unsigned> Verbosity;
}
namespace llvm {
namespace bolt {
@ -65,10 +69,10 @@ constexpr int CallerDegradeFactor = 8;
Cluster::Cluster(NodeId Id, const Node &Func) {
Targets.push_back(Id);
Size = Func.Size;
Samples = Func.Samples;
Size = Func.size();
Samples = Func.samples();
Density = (double)Samples / Size;
Frozen = false;
DEBUG(dbgs() << "new Cluster: " << toString() << "\n");
}
std::string Cluster::toString() const {
@ -91,25 +95,31 @@ void freezeClusters(const CallGraph &Cg, std::vector<Cluster> &Clusters) {
uint32_t TotalSize = 0;
std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity);
for (auto &C : Clusters) {
uint32_t NewSize = TotalSize + C.Size;
uint32_t NewSize = TotalSize + C.size();
if (NewSize > FrozenPages * HugePageSize) break;
C.Frozen = true;
C.freeze();
TotalSize = NewSize;
auto Fid = C.Targets[0];
DEBUG(dbgs() <<
format("freezing cluster for func %d, size = %u, samples = %u)\n",
Fid, Cg.Nodes[Fid].Size, Cg.Nodes[Fid].Samples););
DEBUG(
auto Fid = C.target(0);
dbgs() <<
format("freezing cluster for func %d, size = %u, samples = %lu)\n",
Fid, Cg.size(Fid), Cg.samples(Fid)););
}
}
}
void Cluster::reverseTargets() {
std::reverse(Targets.begin(), Targets.end());
}
void Cluster::merge(Cluster&& Other, const double Aw) {
Targets.insert(Targets.end(),
Other.Targets.begin(),
Other.Targets.end());
Size += Other.Size;
Samples += Other.Samples;
Density = (double)Samples / Size;
Other.Size = 0;
Other.Samples = 0;
@ -120,13 +130,13 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
std::vector<NodeId> SortedFuncs;
// indexed by NodeId, keeps its current cluster
std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
std::vector<Cluster*> FuncCluster(Cg.numNodes(), nullptr);
std::vector<Cluster> Clusters;
Clusters.reserve(Cg.Nodes.size());
Clusters.reserve(Cg.numNodes());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
for (NodeId F = 0; F < Cg.numNodes(); F++) {
if (Cg.samples(F) == 0) continue;
Clusters.emplace_back(F, Cg.getNode(F));
SortedFuncs.push_back(F);
}
@ -135,18 +145,18 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
// The size and order of Clusters is fixed until we reshuffle it immediately
// before returning.
for (auto &Cluster : Clusters) {
FuncCluster[Cluster.Targets.front()] = &Cluster;
FuncCluster[Cluster.targets().front()] = &Cluster;
}
std::sort(
SortedFuncs.begin(),
SortedFuncs.end(),
[&] (const NodeId F1, const NodeId F2) {
const auto &Func1 = Cg.Nodes[F1];
const auto &Func2 = Cg.Nodes[F2];
const auto &Func1 = Cg.getNode(F1);
const auto &Func2 = Cg.getNode(F2);
return
(uint64_t)Func1.Samples * Func2.Size > // TODO: is this correct?
(uint64_t)Func2.Samples * Func1.Size;
Func1.samples() * Func2.size() > // TODO: is this correct?
Func2.samples() * Func1.size();
}
);
@ -154,17 +164,17 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
// one containing its most likely predecessor.
for (const auto Fid : SortedFuncs) {
auto Cluster = FuncCluster[Fid];
if (Cluster->Frozen) continue;
if (Cluster->frozen()) continue;
// Find best predecessor.
NodeId BestPred = CallGraph::InvalidId;
double BestProb = 0;
for (const auto Src : Cg.Nodes[Fid].Preds) {
auto &A = *Cg.Arcs.find(Arc(Src, Fid));
if (BestPred == CallGraph::InvalidId || A.NormalizedWeight > BestProb) {
BestPred = A.Src;
BestProb = A.NormalizedWeight;
for (const auto Src : Cg.predecessors(Fid)) {
const auto &Arc = *Cg.findArc(Src, Fid);
if (BestPred == CallGraph::InvalidId || Arc.normalizedWeight() > BestProb) {
BestPred = Arc.src();
BestProb = Arc.normalizedWeight();
}
}
@ -180,29 +190,32 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
// Skip if there is no PredCluster (predecessor with no samples), if it is
// the same as Cluster, or if it's frozen.
if (PredCluster == nullptr || PredCluster == Cluster ||
PredCluster->Frozen) {
PredCluster->frozen()) {
continue;
}
// Skip if merged cluster would be bigger than the threshold.
if (Cluster->Size + PredCluster->Size > MaxClusterSize) continue;
if (Cluster->size() + PredCluster->size() > MaxClusterSize) continue;
// Check if the merge is good for the caller.
// Don't merge if the caller's density is significantly better
// than the density resulting from the merge.
const double NewDensity =
((double)PredCluster->Samples + Cluster->Samples) /
(PredCluster->Size + Cluster->Size);
((double)PredCluster->samples() + Cluster->samples()) /
(PredCluster->size() + Cluster->size());
if (PredCluster->density() > NewDensity * CallerDegradeFactor) {
continue;
}
DEBUG(dbgs() << format("merging %s -> %s: %u\n",
PredCluster->toString().c_str(),
Cluster->toString().c_str(),
Cg.Nodes[Fid].Samples););
DEBUG(
if (opts::Verbosity > 1) {
dbgs() << format("merging %s -> %s: %u\n",
PredCluster->toString().c_str(),
Cluster->toString().c_str(),
Cg.samples(Fid));
});
for (auto F : Cluster->Targets) {
for (auto F : Cluster->targets()) {
FuncCluster[F] = PredCluster;
}
@ -212,12 +225,16 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
// Return the set of Clusters that are left, which are the ones that
// didn't get merged (so their first func is its original func).
std::vector<Cluster> SortedClusters;
std::unordered_set<Cluster *> Visited;
for (const auto Func : SortedFuncs) {
auto Cluster = FuncCluster[Func];
if (!Cluster || Cluster->Targets.empty()) continue;
if (Cluster->Targets[0] != Func) continue;
if (!Cluster ||
Visited.count(Cluster) == 1 ||
Cluster->target(0) != Func) {
continue;
}
SortedClusters.emplace_back(std::move(*Cluster));
Cluster->Targets.clear();
Visited.insert(Cluster);
}
std::sort(SortedClusters.begin(),
@ -228,32 +245,32 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
}
std::vector<Cluster> randomClusters(const CallGraph &Cg) {
std::vector<NodeId> FuncIds(Cg.Nodes.size(), 0);
std::vector<NodeId> FuncIds(Cg.numNodes(), 0);
std::vector<Cluster> Clusters;
Clusters.reserve(Cg.Nodes.size());
Clusters.reserve(Cg.numNodes());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
for (NodeId F = 0; F < Cg.numNodes(); F++) {
if (Cg.samples(F) == 0) continue;
Clusters.emplace_back(F, Cg.getNode(F));
}
std::sort(Clusters.begin(),
Clusters.end(),
[](const Cluster &A, const Cluster &B) {
return A.Size < B.Size;
return A.size() < B.size();
});
auto pickMergeCluster = [&Clusters](const size_t Idx) {
size_t MaxIdx = Idx + 1;
while (MaxIdx < Clusters.size() &&
Clusters[Idx].Size + Clusters[MaxIdx].Size <= MaxClusterSize) {
Clusters[Idx].size() + Clusters[MaxIdx].size() <= MaxClusterSize) {
++MaxIdx;
}
if (MaxIdx - Idx > 1) {
size_t MergeIdx = (std::rand() % (MaxIdx - Idx - 1)) + Idx + 1;
assert(Clusters[MergeIdx].Size + Clusters[Idx].Size <= MaxClusterSize);
assert(Clusters[MergeIdx].size() + Clusters[Idx].size() <= MaxClusterSize);
return MergeIdx;
}
return Clusters.size();

View File

@ -50,15 +50,27 @@ public:
Cluster(CallGraph::NodeId Id, const CallGraph::Node &F);
std::string toString() const;
double density() const {
return (double)Samples / Size;
}
double density() const { return Density; }
uint64_t samples() const { return Samples; }
uint32_t size() const { return Size; }
bool frozen() const { return Frozen; }
void freeze() { Frozen = true; }
void merge(Cluster &&Other, const double Aw = 0);
size_t numTargets() const {
return Targets.size();
}
const std::vector<CallGraph::NodeId> &targets() const {
return Targets;
}
CallGraph::NodeId target(size_t N) const {
return Targets[N];
}
void reverseTargets();
private:
std::vector<CallGraph::NodeId> Targets;
uint32_t Samples;
uint64_t Samples;
uint32_t Size;
double Density;
bool Frozen; // not a candidate for merging
};
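
A hedged sketch of the Cluster merge semantics shown in the HFSort.cpp hunk above (the wrapper function is illustrative; the post-conditions in the comments follow the visible body of Cluster::merge):

```cpp
#include "HFSort.h"

using namespace llvm;
using namespace bolt;

void mergeExample(const CallGraph &Cg, CallGraph::NodeId A,
                  CallGraph::NodeId B) {
  Cluster C1(A, Cg.getNode(A));
  Cluster C2(B, Cg.getNode(B));
  C1.merge(std::move(C2));
  // After the merge:
  //   C1.targets() holds A followed by B,
  //   C1.size()    == Cg.size(A) + Cg.size(B),
  //   C1.samples() == Cg.samples(A) + Cg.samples(B),
  //   C1.density() is recomputed as samples / size,
  //   and the donor's size/samples are zeroed.
}
```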

View File

@ -1,4 +1,4 @@
//===--- HFSort.cpp - Cluster functions by hotness ------------------------===//
//===--- HFSortPlus.cpp - Cluster functions by hotness --------------------===//
//
// The LLVM Compiler Infrastructure
//
@ -144,9 +144,9 @@ void sortByDensity(std::vector<Cluster *> &Clusters) {
const double D2 = C2->density();
// making sure the sorting is deterministic
if (D1 != D2) return D1 > D2;
if (C1->Size != C2->Size) return C1->Size < C2->Size;
if (C1->Samples != C2->Samples) return C1->Samples > C2->Samples;
return C1->Targets[0] < C2->Targets[0];
if (C1->size() != C2->size()) return C1->size() < C2->size();
if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
return C1->target(0) < C2->target(0);
}
);
}
@ -155,8 +155,8 @@ void sortByDensity(std::vector<Cluster *> &Clusters) {
* Density of a cluster formed by merging a given pair of clusters
*/
double density(Cluster *ClusterPred, Cluster *ClusterSucc) {
const double CombinedSamples = ClusterPred->Samples + ClusterSucc->Samples;
const double CombinedSize = ClusterPred->Size + ClusterSucc->Size;
const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
return CombinedSamples / CombinedSize;
}
@ -199,42 +199,42 @@ double expectedCacheHitRatio(const AlgoState &State,
sortByDensity(Clusters);
// generate function addresses with an alignment
std::vector<size_t> Addr(State.Cg->Nodes.size(), InvalidAddr);
std::vector<size_t> Addr(State.Cg->numNodes(), InvalidAddr);
size_t CurAddr = 0;
// 'hotness' of the pages
std::vector<double> PageSamples;
for (auto Cluster : Clusters) {
for (auto TargetId : Cluster->Targets) {
for (auto TargetId : Cluster->targets()) {
if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16;
Addr[TargetId] = CurAddr;
CurAddr += State.Cg->Nodes[TargetId].Size;
CurAddr += State.Cg->size(TargetId);
// update page weight
size_t Page = Addr[TargetId] / PageSize;
while (PageSamples.size() <= Page) PageSamples.push_back(0.0);
PageSamples[Page] += State.Cg->Nodes[TargetId].Samples;
PageSamples[Page] += State.Cg->samples(TargetId);
}
}
// computing expected number of misses for every function
double Misses = 0;
for (auto Cluster : Clusters) {
for (auto TargetId : Cluster->Targets) {
for (auto TargetId : Cluster->targets()) {
size_t Page = Addr[TargetId] / PageSize;
double Samples = State.Cg->Nodes[TargetId].Samples;
double Samples = State.Cg->samples(TargetId);
// probability that the page is not present in the cache
double MissProb = missProbability(State, PageSamples[Page]);
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
if (State.Cg->Nodes[Pred].Samples == 0) continue;
auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));
for (auto Pred : State.Cg->predecessors(TargetId)) {
if (State.Cg->samples(Pred) == 0) continue;
const auto &Arc = *State.Cg->findArc(Pred, TargetId);
// the source page
size_t SrcPage = (Addr[Pred] + (size_t)A->AvgCallOffset) / PageSize;
size_t SrcPage = (Addr[Pred] + (size_t)Arc.avgCallOffset()) / PageSize;
if (Page != SrcPage) {
// this is a miss
Misses += A->Weight * MissProb;
Misses += Arc.weight() * MissProb;
}
Samples -= A->Weight;
Samples -= Arc.weight();
}
// the remaining samples come from the jitted code
@ -251,14 +251,14 @@ double expectedCacheHitRatio(const AlgoState &State,
std::unordered_set<Cluster *> adjacentClusters(const AlgoState &State,
Cluster *C) {
std::unordered_set<Cluster *> Result;
for (auto TargetId : C->Targets) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
for (auto TargetId : C->targets()) {
for (auto Succ : State.Cg->successors(TargetId)) {
auto SuccCluster = State.FuncCluster[Succ];
if (SuccCluster != nullptr && SuccCluster != C) {
Result.insert(SuccCluster);
}
}
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
for (auto Pred : State.Cg->predecessors(TargetId)) {
auto PredCluster = State.FuncCluster[Pred];
if (PredCluster != nullptr && PredCluster != C) {
Result.insert(PredCluster);
@ -285,15 +285,15 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) {
*/
double shortCalls(const AlgoState &State, Cluster *Cluster) {
double Calls = 0;
for (auto TargetId : Cluster->Targets) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
for (auto TargetId : Cluster->targets()) {
for (auto Succ : State.Cg->successors(TargetId)) {
if (State.FuncCluster[Succ] == Cluster) {
auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));
const auto &Arc = *State.Cg->findArc(TargetId, Succ);
auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset;
auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset();
auto DstAddr = State.Addr[Succ];
Calls += expectedCalls(SrcAddr, DstAddr, A->Weight);
Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight());
}
}
}
@ -309,29 +309,29 @@ double shortCalls(const AlgoState &State,
Cluster *ClusterPred,
Cluster *ClusterSucc) {
double Calls = 0;
for (auto TargetId : ClusterPred->Targets) {
for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
for (auto TargetId : ClusterPred->targets()) {
for (auto Succ : State.Cg->successors(TargetId)) {
if (State.FuncCluster[Succ] == ClusterSucc) {
auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));
const auto &Arc = *State.Cg->findArc(TargetId, Succ);
auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset;
auto DstAddr = State.Addr[Succ] + ClusterPred->Size;
auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset();
auto DstAddr = State.Addr[Succ] + ClusterPred->size();
Calls += expectedCalls(SrcAddr, DstAddr, A->Weight);
Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight());
}
}
}
for (auto TargetId : ClusterPred->Targets) {
for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
for (auto TargetId : ClusterPred->targets()) {
for (auto Pred : State.Cg->predecessors(TargetId)) {
if (State.FuncCluster[Pred] == ClusterSucc) {
auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));
const auto &Arc = *State.Cg->findArc(Pred, TargetId);
auto SrcAddr = State.Addr[Pred] + A->AvgCallOffset +
ClusterPred->Size;
auto SrcAddr = State.Addr[Pred] + Arc.avgCallOffset() +
ClusterPred->size();
auto DstAddr = State.Addr[TargetId];
Calls += expectedCalls(SrcAddr, DstAddr, A->Weight);
Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight());
}
}
}
@ -355,12 +355,12 @@ double mergeGain(const AlgoState &State,
Cluster *ClusterPred,
Cluster *ClusterSucc) {
// cache misses on the first cluster
double LongCallsPred = ClusterPred->Samples - shortCalls(State, ClusterPred);
double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred);
double ProbPred = missProbability(State, ClusterPred->density() * PageSize);
double ExpectedMissesPred = LongCallsPred * ProbPred;
// cache misses on the second cluster
double LongCallsSucc = ClusterSucc->Samples - shortCalls(State, ClusterSucc);
double LongCallsSucc = ClusterSucc->samples() - shortCalls(State, ClusterSucc);
double ProbSucc = missProbability(State, ClusterSucc->density() * PageSize);
double ExpectedMissesSucc = LongCallsSucc * ProbSucc;
@ -373,28 +373,7 @@ double mergeGain(const AlgoState &State,
double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew;
// scaling the result to increase the importance of merging short clusters
return Gain / (ClusterPred->Size + ClusterSucc->Size);
return Gain / (ClusterPred->size() + ClusterSucc->size());
}
/*
* Merge two clusters
*/
void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
auto &Targets = Other->Targets;
Into->Targets.insert(Into->Targets.end(), Targets.begin(), Targets.end());
Into->Size += Other->Size;
Into->Samples += Other->Samples;
size_t CurAddr = 0;
for (auto TargetId : Into->Targets) {
State.FuncCluster[TargetId] = Into;
State.Addr[TargetId] = CurAddr;
CurAddr += State.Cg->Nodes[TargetId].Size;
}
Other->Size = 0;
Other->Samples = 0;
Other->Targets.clear();
}
/*
@ -403,26 +382,26 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
// create a cluster for every function
std::vector<Cluster> AllClusters;
AllClusters.reserve(Cg.Nodes.size());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
AllClusters.emplace_back(F, Cg.Nodes[F]);
AllClusters.reserve(Cg.numNodes());
for (NodeId F = 0; F < Cg.numNodes(); F++) {
AllClusters.emplace_back(F, Cg.getNode(F));
}
// initialize objects used by the algorithm
std::vector<Cluster *> Clusters;
Clusters.reserve(Cg.Nodes.size());
Clusters.reserve(Cg.numNodes());
AlgoState State;
State.Cg = &Cg;
State.TotalSamples = 0;
State.FuncCluster = std::vector<Cluster *>(Cg.Nodes.size(), nullptr);
State.Addr = std::vector<size_t>(Cg.Nodes.size(), InvalidAddr);
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
State.FuncCluster = std::vector<Cluster *>(Cg.numNodes(), nullptr);
State.Addr = std::vector<size_t>(Cg.numNodes(), InvalidAddr);
for (NodeId F = 0; F < Cg.numNodes(); F++) {
if (Cg.samples(F) == 0) continue;
Clusters.push_back(&AllClusters[F]);
State.FuncCluster[F] = &AllClusters[F];
State.Addr[F] = 0;
State.TotalSamples += Cg.Nodes[F].Samples;
State.TotalSamples += Cg.samples(F);
}
DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n"
@ -482,7 +461,15 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
Cache.invalidate(BestClusterSucc);
// merge the best pair of clusters
mergeInto(State, BestClusterPred, BestClusterSucc);
BestClusterPred->merge(std::move(*BestClusterSucc));
size_t CurAddr = 0;
for (auto TargetId : BestClusterPred->targets()) {
State.FuncCluster[TargetId] = BestClusterPred;
State.Addr[TargetId] = CurAddr;
CurAddr += State.Cg->size(TargetId);
}
// remove BestClusterSucc from the list of active clusters
auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc);
Clusters.erase(Iter, Clusters.end());

View File

@ -44,29 +44,29 @@ public:
using ClusterArcSet = std::unordered_set<ClusterArc, ClusterArcHash>;
void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) {
auto C1head = C1->Targets.front();
auto C1tail = C1->Targets.back();
auto C2head = C2->Targets.front();
auto C2tail = C2->Targets.back();
auto C1head = C1->targets().front();
auto C1tail = C1->targets().back();
auto C2head = C2->targets().front();
auto C2tail = C2->targets().back();
double C1headC2head = 0;
double C1headC2tail = 0;
double C1tailC2head = 0;
double C1tailC2tail = 0;
for (const auto &Arc : Cg.Arcs) {
if ((Arc.Src == C1head && Arc.Dst == C2head) ||
(Arc.Dst == C1head && Arc.Src == C2head)) {
C1headC2head += Arc.Weight;
} else if ((Arc.Src == C1head && Arc.Dst == C2tail) ||
(Arc.Dst == C1head && Arc.Src == C2tail)) {
C1headC2tail += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2head) ||
(Arc.Dst == C1tail && Arc.Src == C2head)) {
C1tailC2head += Arc.Weight;
} else if ((Arc.Src == C1tail && Arc.Dst == C2tail) ||
(Arc.Dst == C1tail && Arc.Src == C2tail)) {
C1tailC2tail += Arc.Weight;
for (const auto &Arc : Cg.getArcs()) {
if ((Arc.src() == C1head && Arc.dst() == C2head) ||
(Arc.dst() == C1head && Arc.src() == C2head)) {
C1headC2head += Arc.weight();
} else if ((Arc.src() == C1head && Arc.dst() == C2tail) ||
(Arc.dst() == C1head && Arc.src() == C2tail)) {
C1headC2tail += Arc.weight();
} else if ((Arc.src() == C1tail && Arc.dst() == C2head) ||
(Arc.dst() == C1tail && Arc.src() == C2head)) {
C1tailC2head += Arc.weight();
} else if ((Arc.src() == C1tail && Arc.dst() == C2tail) ||
(Arc.dst() == C1tail && Arc.src() == C2tail)) {
C1tailC2tail += Arc.weight();
}
}
@ -75,29 +75,29 @@ void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) {
if (C1headC2head == Max) {
// flip C1
std::reverse(C1->Targets.begin(), C1->Targets.end());
C1->reverseTargets();
} else if (C1headC2tail == Max) {
// flip C1 C2
std::reverse(C1->Targets.begin(), C1->Targets.end());
std::reverse(C2->Targets.begin(), C2->Targets.end());
C1->reverseTargets();
C2->reverseTargets();
} else if (C1tailC2tail == Max) {
// flip C2
std::reverse(C2->Targets.begin(), C2->Targets.end());
C2->reverseTargets();
}
}
}
std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
// indexed by NodeId, keeps its current cluster
std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
std::vector<Cluster*> FuncCluster(Cg.numNodes(), nullptr);
std::vector<Cluster> Clusters;
std::vector<NodeId> Funcs;
Clusters.reserve(Cg.Nodes.size());
Clusters.reserve(Cg.numNodes());
for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
if (Cg.Nodes[F].Samples == 0) continue;
Clusters.emplace_back(F, Cg.Nodes[F]);
for (NodeId F = 0; F < Cg.numNodes(); F++) {
if (Cg.samples(F) == 0) continue;
Clusters.emplace_back(F, Cg.getNode(F));
FuncCluster[F] = &Clusters.back();
Funcs.push_back(F);
}
@ -113,11 +113,11 @@ std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
// Create a std::vector of cluster arcs
for (auto &Arc : Cg.Arcs) {
if (Arc.Weight == 0) continue;
for (auto &Arc : Cg.getArcs()) {
if (Arc.weight() == 0) continue;
auto const S = FuncCluster[Arc.Src];
auto const D = FuncCluster[Arc.Dst];
auto const S = FuncCluster[Arc.src()];
auto const D = FuncCluster[Arc.dst()];
// ignore if s or d is nullptr
@ -127,7 +127,7 @@ std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
if (S == D) continue;
insertOrInc(S, D, Arc.Weight);
insertOrInc(S, D, Arc.weight());
}
// Find an arc with max weight and merge its nodes
@ -147,9 +147,9 @@ std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
auto const C1 = Max.C1;
auto const C2 = Max.C2;
if (C1->Size + C2->Size > MaxClusterSize) continue;
if (C1->size() + C2->size() > MaxClusterSize) continue;
if (C1->Frozen || C2->Frozen) continue;
if (C1->frozen() || C2->frozen()) continue;
// order functions and merge cluster
@ -176,7 +176,7 @@ std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
// update FuncCluster
for (auto F : C2->Targets) {
for (auto F : C2->targets()) {
FuncCluster[F] = C1;
}
C1->merge(std::move(*C2), Max.Weight);

View File

@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "ReorderFunctions.h"
#include "HFSort.h"
#include "llvm/Support/Options.h"
#include <fstream>
@ -90,42 +91,19 @@ using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
void ReorderFunctions::normalizeArcWeights() {
// Normalize arc weights.
if (!opts::UseEdgeCounts) {
for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
auto& Func = Cg.Nodes[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
A.AvgCallOffset /= A.Weight;
assert(A.AvgCallOffset < Cg.Nodes[Caller].Size);
}
}
} else {
for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
auto &Func = Cg.Nodes[FuncId];
for (auto Caller : Func.Preds) {
auto& A = *Cg.Arcs.find(Arc(Caller, FuncId));
A.NormalizedWeight = A.Weight / Func.Samples;
}
}
}
}
void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs) {
std::vector<uint64_t> FuncAddr(Cg.Nodes.size()); // Just for computing stats
std::vector<uint64_t> FuncAddr(Cg.numNodes()); // Just for computing stats
uint64_t TotalSize = 0;
uint32_t Index = 0;
// Set order of hot functions based on clusters.
for (const auto& Cluster : Clusters) {
for (const auto FuncId : Cluster.Targets) {
assert(Cg.Nodes[FuncId].Samples > 0);
Cg.Funcs[FuncId]->setIndex(Index++);
for (const auto FuncId : Cluster.targets()) {
assert(Cg.samples(FuncId) > 0);
Cg.nodeIdToFunc(FuncId)->setIndex(Index++);
FuncAddr[FuncId] = TotalSize;
TotalSize += Cg.Nodes[FuncId].Size;
TotalSize += Cg.size(FuncId);
}
}
@ -141,6 +119,11 @@ void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
#endif
}
bool PrintDetailed = opts::Verbosity > 1;
#ifndef NDEBUG
PrintDetailed |=
(DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0);
#endif
TotalSize = 0;
uint64_t CurPage = 0;
uint64_t Hotfuncs = 0;
@ -149,65 +132,84 @@ void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
double TotalCalls64B = 0;
double TotalCalls4KB = 0;
double TotalCalls2MB = 0;
dbgs() << "============== page 0 ==============\n";
if (PrintDetailed) {
outs() << "BOLT-INFO: Function reordering page layout\n"
<< "BOLT-INFO: ============== page 0 ==============\n";
}
for (auto& Cluster : Clusters) {
dbgs() <<
format("-------- density = %.3lf (%u / %u) --------\n",
(double) Cluster.Samples / Cluster.Size,
Cluster.Samples, Cluster.Size);
if (PrintDetailed) {
outs() <<
format("BOLT-INFO: -------- density = %.3lf (%u / %u) --------\n",
Cluster.density(), Cluster.samples(), Cluster.size());
}
for (auto FuncId : Cluster.Targets) {
if (Cg.Nodes[FuncId].Samples > 0) {
for (auto FuncId : Cluster.targets()) {
if (Cg.samples(FuncId) > 0) {
Hotfuncs++;
dbgs() << "BOLT-INFO: hot func " << *Cg.Funcs[FuncId]
<< " (" << Cg.Nodes[FuncId].Size << ")\n";
if (PrintDetailed) {
outs() << "BOLT-INFO: hot func " << *Cg.nodeIdToFunc(FuncId)
<< " (" << Cg.size(FuncId) << ")\n";
}
uint64_t Dist = 0;
uint64_t Calls = 0;
for (auto Dst : Cg.Nodes[FuncId].Succs) {
auto& A = *Cg.Arcs.find(Arc(FuncId, Dst));
auto D =
std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset));
auto W = A.Weight;
for (auto Dst : Cg.successors(FuncId)) {
const auto& Arc = *Cg.findArc(FuncId, Dst);
const auto D = std::abs(FuncAddr[Arc.dst()] -
(FuncAddr[FuncId] + Arc.avgCallOffset()));
const auto W = Arc.weight();
Calls += W;
if (D < 64) TotalCalls64B += W;
if (D < 4096) TotalCalls4KB += W;
if (D < (2 << 20)) TotalCalls2MB += W;
Dist += A.Weight * D;
dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
"weight = %.0lf, callDist = %f\n",
A.Src, FuncAddr[A.Src], A.AvgCallOffset,
A.Dst, FuncAddr[A.Dst], A.Weight, D);
Dist += Arc.weight() * D;
if (PrintDetailed) {
outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
"weight = %.0lf, callDist = %f\n",
Arc.src(),
FuncAddr[Arc.src()],
Arc.avgCallOffset(),
Arc.dst(),
FuncAddr[Arc.dst()],
Arc.weight(), D);
}
}
TotalCalls += Calls;
TotalDistance += Dist;
dbgs() << format("start = %6u : avgCallDist = %lu : %s\n",
TotalSize,
Calls ? Dist / Calls : 0,
Cg.Funcs[FuncId]->getPrintName().c_str());
TotalSize += Cg.Nodes[FuncId].Size;
auto NewPage = TotalSize / HugePageSize;
if (NewPage != CurPage) {
CurPage = NewPage;
dbgs() << format("============== page %u ==============\n", CurPage);
TotalSize += Cg.size(FuncId);
if (PrintDetailed) {
outs() << format("BOLT-INFO: start = %6u : avgCallDist = %lu : %s\n",
TotalSize,
Calls ? Dist / Calls : 0,
Cg.nodeIdToFunc(FuncId)->getPrintName().c_str());
const auto NewPage = TotalSize / HugePageSize;
if (NewPage != CurPage) {
CurPage = NewPage;
outs() <<
format("BOLT-INFO: ============== page %u ==============\n",
CurPage);
}
}
}
}
}
dbgs() << format(" Number of hot functions: %u\n"
" Number of clusters: %lu\n",
outs() << "BOLT-INFO: Function reordering stats\n"
<< format("BOLT-INFO: Number of hot functions: %u\n"
"BOLT-INFO: Number of clusters: %lu\n",
Hotfuncs, Clusters.size())
<< format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n",
<< format("BOLT-INFO: Final average call distance = %.1lf "
"(%.0lf / %.0lf)\n",
TotalCalls ? TotalDistance / TotalCalls : 0,
TotalDistance, TotalCalls)
<< format(" Total Calls = %.0lf\n", TotalCalls);
<< format("BOLT-INFO: Total Calls = %.0lf\n", TotalCalls);
if (TotalCalls) {
dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n",
outs() << format("BOLT-INFO: Total Calls within 64B = %.0lf (%.2lf%%)\n",
TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
<< format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n",
<< format("BOLT-INFO: Total Calls within 4KB = %.0lf (%.2lf%%)\n",
TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
<< format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n",
<< format("BOLT-INFO: Total Calls within 2MB = %.0lf (%.2lf%%)\n",
TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
}
}
@ -251,7 +253,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC,
false, // IncludeColdCalls
opts::ReorderFunctionsUseHotSize,
opts::UseEdgeCounts);
normalizeArcWeights();
Cg.normalizeArcWeights(opts::UseEdgeCounts);
}
std::vector<Cluster> Clusters;

View File

@ -13,16 +13,15 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FNCTIONS_H
#include "BinaryPasses.h"
#include "HFSort.h"
#include "BinaryFunctionCallGraph.h"
namespace llvm {
namespace bolt {
/// Modify function order for streaming based on hotness.
class ReorderFunctions : public BinaryFunctionPass {
CallGraph Cg;
BinaryFunctionCallGraph Cg;
void normalizeArcWeights();
void reorder(std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs);
public:

View File

@ -1534,14 +1534,14 @@ void RewriteInstance::readRelocations(const SectionRef &Section) {
Rel.getType() != ELF::R_X86_64_GOTTPOFF &&
Rel.getType() != ELF::R_X86_64_GOTPCREL) {
if (!IsPCRelative) {
if (opts::Verbosity > 1 &&
if (opts::Verbosity > 2 &&
ExtractedValue != Address) {
errs() << "BOLT-WARNING: mismatch ExtractedValue = 0x"
<< Twine::utohexstr(ExtractedValue) << '\n';
}
Address = ExtractedValue;
} else {
if (opts::Verbosity > 1 &&
if (opts::Verbosity > 2 &&
ExtractedValue != Address - Rel.getOffset() + Addend) {
errs() << "BOLT-WARNING: PC-relative mismatch ExtractedValue = 0x"
<< Twine::utohexstr(ExtractedValue) << '\n';