@@ -28,18 +28,53 @@
*/

#include "HFSort.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/raw_ostream.h"

#include <vector>
#include <unordered_map>
#include <unordered_set>
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

#undef DEBUG_TYPE
#define DEBUG_TYPE "hfsort"

namespace opts {

extern llvm::cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<bool> Verbosity;

static llvm::cl::opt<bool>
UseGainCache("hfsort+-use-cache",
  llvm::cl::desc("Use a cache for mergeGain results when computing hfsort+."),
  llvm::cl::ZeroOrMore,
  llvm::cl::init(true),
  llvm::cl::Hidden,
  llvm::cl::cat(BoltOptCategory));

static llvm::cl::opt<bool>
UseShortCallCache("hfsort+-use-short-call-cache",
  llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."),
  llvm::cl::ZeroOrMore,
  llvm::cl::init(true),
  llvm::cl::Hidden,
  llvm::cl::cat(BoltOptCategory));

const char* cacheKindString() {
  if (opts::UseGainCache && opts::UseShortCallCache)
    return "gain + short call cache";
  else if (opts::UseGainCache)
    return "gain cache";
  else if (opts::UseShortCallCache)
    return "short call cache";
  else
    return "no cache";
}

}

namespace llvm {
namespace bolt {

@@ -60,66 +95,136 @@ constexpr uint32_t ITLBEntries = 16;

constexpr size_t InvalidAddr = -1;

template <typename A, typename B>
class HashPair {
// This class maintains adjacency information for all Clusters being
// processed. It is used to invalidate cache entries when merging
// Clusters and for visiting all neighbors of any given Cluster.
class AdjacencyMatrix {
public:
  size_t operator()(const std::pair<A, B> &P) const {
    size_t Seed(0);
    Seed = hashCombine(Seed, (int64_t)P.first);
    Seed = hashCombine(Seed, (int64_t)P.second);
    return Seed;
  AdjacencyMatrix(const CallGraph &Cg,
                  std::vector<Cluster *> &Clusters,
                  const std::vector<Cluster *> &FuncCluster)
  : Clusters(Clusters),
    Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) {
    initialize(Cg, FuncCluster);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) const {
    const_cast<AdjacencyMatrix *>(this)->forallAdjacent(C, Func);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) {
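    // Walk the set bits of C's row; each set bit index is the id of a
    // cluster adjacent to C.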
    for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) {
      Func(Clusters[I]);
    }
  }

  void merge(const Cluster *A, const Cluster *B) {
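    // Fold B's adjacency row into A's, clear self-edges and the A<->B edge,
    // then repoint every former neighbor of B at A so the matrix stays
    // symmetric.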
    Bits[A->id()] |= Bits[B->id()];
    Bits[A->id()][A->id()] = false;
    Bits[A->id()][B->id()] = false;
    Bits[B->id()][A->id()] = false;
    for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) {
      Bits[I][A->id()] = true;
      Bits[I][B->id()] = false;
    }
  }

  void dump(const Cluster *A) const {
    outs() << "Cluster " << A->id() << ":";
    forallAdjacent(A,
                   [this,A](const Cluster *B) {
                     outs() << " " << B->id();
                   });
  }

  void dump() const {
    for (auto *A : Clusters) {
      if (!A) continue;
      dump(A);
      outs() << "\n";
    }
  }
private:
  void set(const Cluster *A, const Cluster *B, bool Value) {
    assert(A != B);
    Bits[A->id()][B->id()] = Value;
    Bits[B->id()][A->id()] = Value;
  }

  void initialize(const CallGraph &Cg, const std::vector<Cluster *> &FuncCluster) {
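    // A cluster is adjacent to every cluster that contains a caller or a
    // callee of one of its functions.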
    for (auto *A : Clusters) {
      for (auto TargetId : A->targets()) {
        for (auto Succ : Cg.successors(TargetId)) {
          auto *B = FuncCluster[Succ];
          if (!B || B == A) continue;
          set(A, B, true);
        }
        for (auto Pred : Cg.predecessors(TargetId)) {
          auto *B = FuncCluster[Pred];
          if (!B || B == A) continue;
          set(A, B, true);
        }
      }
    }
  }

  std::vector<Cluster *> Clusters;
  std::vector<BitVector> Bits;
};

// A cache of precomputed results for a pair of clusters
class PrecomputedResults {
public:
  PrecomputedResults() {}

  bool contains(Cluster *First, Cluster *Second) const {
    if (InvalidKeys.count(First) || InvalidKeys.count(Second)) {
      return false;
    }
    const auto Key = std::make_pair(First, Second);
    return Cache.find(Key) != Cache.end();
  explicit PrecomputedResults(size_t Size)
  : Size(Size),
    Cache(new double[Size*Size]),
    Valid(Size * Size, false) {
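    // Zero the dense Size x Size gain matrix up front; an entry only becomes
    // meaningful once its corresponding Valid bit is set.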
    memset(Cache, 0, sizeof(double)*Size*Size);
  }
  ~PrecomputedResults() {
    delete[] Cache;
  }

  double get(Cluster *First, Cluster *Second) const {
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)];
  }

  double get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    const auto Key = std::make_pair(First, Second); // TODO: use min/max?
    return Cache.find(Key)->second;
    return Cache[index(First, Second)];
  }

  void set(Cluster *First, Cluster *Second, double Value) {
    const auto Key = std::make_pair(First, Second);
    Cache[Key] = Value;
    validate(First);
    validate(Second);
  void set(const Cluster *First, const Cluster *Second, double Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = true;
  }

  void validate(Cluster *C) {
    auto Itr = InvalidKeys.find(C);
    if (Itr != InvalidKeys.end())
      InvalidKeys.erase(Itr);
  void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) {
    invalidate(C);
    Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); });
  }

  void validateAll() {
    InvalidKeys.clear();
  }

  void invalidate(Cluster *Cluster) {
    InvalidKeys.insert(Cluster);
  }

private:
  std::unordered_map<std::pair<Cluster *, Cluster *>,
                     double,
                     HashPair<Cluster *,Cluster *>> Cache;
  std::unordered_set<Cluster *> InvalidKeys;
  void invalidate(const Cluster *C) {
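    // Clear the validity bits of every cached pair whose first element is C.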
    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
  }

  size_t index(const Cluster *First, const Cluster *Second) const {
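    // The cache is a row-major Size x Size matrix indexed by cluster ids, so
    // each ordered pair of clusters maps to a unique slot.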
    return (First->id() * Size) + Second->id();
  }

  size_t Size;
  double *Cache;
  BitVector Valid;
};

// A wrapper for algorthm-wide variables
// A wrapper for algorithm-wide variables
struct AlgoState {
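  // The two pair caches are pre-sized; Size is the number of clusters being
  // processed.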
  explicit AlgoState(size_t Size)
  : Cache(Size), ShortCallPairCache(Size) { }

  // the call graph
  const CallGraph *Cg;
  // the total number of samples in the graph
@@ -130,42 +235,72 @@ struct AlgoState {
  std::vector<size_t> Addr;
  // maximum cluster id.
  size_t MaxClusterId{0};
  // A cache that keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y (and recompute them on the next iteration)
  PrecomputedResults Cache;
  // Cache for shortCalls for a single cluster.
  std::unordered_map<const Cluster *, double> ShortCallCache;
  // Cache for shortCalls for a pair of Clusters
  PrecomputedResults ShortCallPairCache;
};

}

/*
 * Sorting clusters by their density in decreasing order
 */
void sortByDensity(std::vector<Cluster *> &Clusters) {
  std::stable_sort(
    Clusters.begin(),
    Clusters.end(),
    [&] (const Cluster *C1, const Cluster *C2) {
      const double D1 = C1->density();
      const double D2 = C2->density();
      // making sure the sorting is deterministic
      if (D1 != D2) return D1 > D2;
      if (C1->size() != C2->size()) return C1->size() < C2->size();
      if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
      return C1->target(0) < C2->target(0);
    }
  );
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
double density(Cluster *ClusterPred, Cluster *ClusterSucc) {
double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) {
  const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
  const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
  return CombinedSamples / CombinedSize;
}

/*
 * Deterministically compare clusters by their density in decreasing order.
 */
bool compareClusters(const Cluster *C1, const Cluster *C2) {
  const double D1 = C1->density();
  const double D2 = C2->density();
  // making sure the sorting is deterministic
  if (D1 != D2) return D1 > D2;
  if (C1->size() != C2->size()) return C1->size() < C2->size();
  if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
  return C1->target(0) < C2->target(0);
}

/*
 * Deterministically compare pairs of clusters by their density
 * in decreasing order.
 */
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                         const Cluster *A2, const Cluster *B2) {
  const auto D1 = density(A1, B1);
  const auto D2 = density(A2, B2);
  if (D1 != D2) return D1 > D2;
  const auto Size1 = A1->size() + B1->size();
  const auto Size2 = A2->size() + B2->size();
  if (Size1 != Size2) return Size1 < Size2;
  const auto Samples1 = A1->samples() + B1->samples();
  const auto Samples2 = A2->samples() + B2->samples();
  if (Samples1 != Samples2) return Samples1 > Samples2;
  return A1->target(0) < A2->target(0);
}

/*
 * Sorting clusters by their density in decreasing order
 */
template <typename C>
std::vector<Cluster *> sortByDensity(const C &Clusters_) {
  std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
  std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
  return Clusters;
}

/*
 * The probability that a page with a given weight is not present in the cache.
 *
 * Assume that the hot function are called in a random order; then the
 * Assume that the hot functions are called in a random order; then the
 * probability of a TLB page being accessed after a function call is
 * p=pageSamples/totalSamples. The probability that the page is not accessed
 * is (1-p), and the probability that it is not in the cache (i.e. not accessed
@@ -194,11 +329,10 @@ double missProbability(const AlgoState &State, double PageSamples) {
 * page. The following procedure detects short and long calls, and estimates
 * the expected number of cache misses for the long ones.
 */
double expectedCacheHitRatio(const AlgoState &State,
                             const std::vector<Cluster *> &Clusters_) {
  // copy and sort by density
  std::vector<Cluster *> Clusters(Clusters_);
  sortByDensity(Clusters);
template <typename C>
double expectedCacheHitRatio(const AlgoState &State, const C &Clusters_) {
  // sort by density
  std::vector<Cluster *> Clusters(sortByDensity(Clusters_));

  // generate function addresses with an alignment
  std::vector<size_t> Addr(State.Cg->numNodes(), InvalidAddr);
@@ -247,35 +381,6 @@ double expectedCacheHitRatio(const AlgoState &State,
  return 100.0 * (1.0 - Misses / State.TotalSamples);
}

/*
 * Get adjacent clusters (the ones that share an arc) with the given one
 */
std::vector<Cluster *> adjacentClusters(const AlgoState &State, Cluster *C) {
  std::vector<Cluster *> Result;
  Result.reserve(State.MaxClusterId);
  for (auto TargetId : C->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
      auto SuccCluster = State.FuncCluster[Succ];
      if (SuccCluster != nullptr && SuccCluster != C) {
        Result.push_back(SuccCluster);
      }
    }
    for (auto Pred : State.Cg->predecessors(TargetId)) {
      auto PredCluster = State.FuncCluster[Pred];
      if (PredCluster != nullptr && PredCluster != C) {
        Result.push_back(PredCluster);
      }
    }
  }
  std::sort(Result.begin(), Result.end(),
            [](const Cluster *A, const Cluster *B) {
              return A->id() < B->id();
            });
  auto Last = std::unique(Result.begin(), Result.end());
  Result.erase(Last, Result.end());
  return Result;
}

/*
 * The expected number of calls for an edge within the same TLB page
 */
@@ -291,7 +396,13 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) {
 * The expected number of calls within a given cluster with both endpoints on
 * the same TLB cache page
 */
double shortCalls(const AlgoState &State, Cluster *Cluster) {
double shortCalls(AlgoState &State, const Cluster *Cluster) {
  if (opts::UseShortCallCache) {
    auto Itr = State.ShortCallCache.find(Cluster);
    if (Itr != State.ShortCallCache.end())
      return Itr->second;
  }

  double Calls = 0;
  for (auto TargetId : Cluster->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
@@ -306,6 +417,10 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) {
    }
  }

  if (opts::UseShortCallCache) {
    State.ShortCallCache[Cluster] = Calls;
  }

  return Calls;
}

@@ -313,9 +428,14 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) {
 * The number of calls between the two clusters with both endpoints on
 * the same TLB page, assuming that a given pair of clusters gets merged
 */
double shortCalls(const AlgoState &State,
                  Cluster *ClusterPred,
                  Cluster *ClusterSucc) {
double shortCalls(AlgoState &State,
                  const Cluster *ClusterPred,
                  const Cluster *ClusterSucc) {
  if (opts::UseShortCallCache &&
      State.ShortCallPairCache.contains(ClusterPred, ClusterSucc)) {
    return State.ShortCallPairCache.get(ClusterPred, ClusterSucc);
  }

  double Calls = 0;
  for (auto TargetId : ClusterPred->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
@@ -344,6 +464,10 @@ double shortCalls(const AlgoState &State,
    }
  }

  if (opts::UseShortCallCache) {
    State.ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls);
  }

  return Calls;
}

@@ -359,9 +483,13 @@ double shortCalls(const AlgoState &State,
 * increase the chance of merging short clusters, which is helpful for
 * the i-cache performance.
 */
double mergeGain(const AlgoState &State,
                 Cluster *ClusterPred,
                 Cluster *ClusterSucc) {
double mergeGain(AlgoState &State,
                 const Cluster *ClusterPred,
                 const Cluster *ClusterSucc) {
  if (opts::UseGainCache && State.Cache.contains(ClusterPred, ClusterSucc)) {
    return State.Cache.get(ClusterPred, ClusterSucc);
  }

  // cache misses on the first cluster
  double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred);
  double ProbPred = missProbability(State, ClusterPred->density() * PageSize);
@@ -381,7 +509,20 @@ double mergeGain(const AlgoState &State,

  double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew;
  // scaling the result to increase the importance of merging short clusters
  return Gain / (ClusterPred->size() + ClusterSucc->size());
  Gain /= (ClusterPred->size() + ClusterSucc->size());

  if (opts::UseGainCache) {
    State.Cache.set(ClusterPred, ClusterSucc, Gain);
  }

  return Gain;
}

template <typename C, typename V>
void maybeErase(C &Container, const V& Value) {
  auto Itr = Container.find(Value);
  if (Itr != Container.end())
    Container.erase(Itr);
}

/*
@@ -393,37 +534,35 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
  AllClusters.reserve(Cg.numNodes());
  for (NodeId F = 0; F < Cg.numNodes(); F++) {
    AllClusters.emplace_back(F, Cg.getNode(F));
    AllClusters.back().setId(F);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster *> Clusters;
  Clusters.reserve(Cg.numNodes());
  AlgoState State;
  AlgoState State(AllClusters.size()); // TODO: should use final Clusters.size()
  State.Cg = &Cg;
  State.TotalSamples = 0;
  State.FuncCluster = std::vector<Cluster *>(Cg.numNodes(), nullptr);
  State.Addr = std::vector<size_t>(Cg.numNodes(), InvalidAddr);
  if (!AllClusters.empty()) {
    State.MaxClusterId = AllClusters.back().id();
  }
  State.Addr = std::vector<size_t>(Cg.numNodes(), InvalidAddr);
  uint32_t Id = 0;
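  // Only functions with samples participate in reordering; give their
  // clusters consecutive ids so they can index the adjacency matrix and the
  // pair caches.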
  for (NodeId F = 0; F < Cg.numNodes(); F++) {
    if (Cg.samples(F) == 0) continue;
    Clusters.push_back(&AllClusters[F]);
    Clusters.back()->setId(Id);
    State.FuncCluster[F] = &AllClusters[F];
    State.Addr[F] = 0;
    State.TotalSamples += Cg.samples(F);
    ++Id;
  }
  State.MaxClusterId = Id;

  DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n"
  AdjacencyMatrix Adjacent(Cg, Clusters, State.FuncCluster);

  DEBUG(dbgs() << "Starting hfsort+ w/" << opts::cacheKindString() << " for "
               << Clusters.size() << " clusters\n"
               << format("Initial expected iTLB cache hit ratio: %.4lf\n",
                         expectedCacheHitRatio(State, Clusters)));

  // the cache keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y (and recompute them on the next iteration)
  PrecomputedResults Cache;

  int Steps = 0;
  // merge pairs of clusters while there is an improvement
  while (Clusters.size() > 1) {
@@ -435,44 +574,46 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
                         expectedCacheHitRatio(State, Clusters));
      }
    );
    Steps++;
    ++Steps;

    Cluster *BestClusterPred = nullptr;
    Cluster *BestClusterSucc = nullptr;
    double BestGain = -1;
    for (auto ClusterPred : Clusters) {
      // get candidates for merging with the current cluster
      auto CandidateClusters = adjacentClusters(State, ClusterPred);
      Adjacent.forallAdjacent(
        ClusterPred,
        // find the best candidate
        [&](Cluster *ClusterSucc) {
          assert(ClusterPred != ClusterSucc);
          // get a cost of merging two clusters
          const double Gain = mergeGain(State, ClusterPred, ClusterSucc);

      // find the best candidate
      for (auto ClusterSucc : CandidateClusters) {
        // get a cost of merging two clusters
        if (!Cache.contains(ClusterPred, ClusterSucc)) {
          double Value = mergeGain(State, ClusterPred, ClusterSucc);
          Cache.set(ClusterPred, ClusterSucc, Value);
          assert(Cache.contains(ClusterPred, ClusterSucc));
          // breaking ties by density to make the hottest clusters be merged first
          if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
                                  compareClusterPairs(ClusterPred,
                                                      ClusterSucc,
                                                      BestClusterPred,
                                                      BestClusterSucc))) {
            BestGain = Gain;
            BestClusterPred = ClusterPred;
            BestClusterSucc = ClusterSucc;
          }
        }

        double Gain = Cache.get(ClusterPred, ClusterSucc);
        // breaking ties by density to make the hottest clusters be merged first
        if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
                                density(ClusterPred, ClusterSucc) >
                                density(BestClusterPred, BestClusterSucc))) {
          BestGain = Gain;
          BestClusterPred = ClusterPred;
          BestClusterSucc = ClusterSucc;
        }
      }
      );
    }
    Cache.validateAll();

    if (BestGain <= 0.0) break;

    Cache.invalidate(BestClusterPred);
    Cache.invalidate(BestClusterSucc);

    // merge the best pair of clusters
    BestClusterPred->merge(std::move(*BestClusterSucc));
    DEBUG(
      if (opts::Verbosity > 0) {
        dbgs() << "Merging cluster " << BestClusterSucc->id()
               << " into cluster " << BestClusterPred->id() << "\n";
      });

    Adjacent.merge(BestClusterPred, BestClusterSucc);
    BestClusterPred->merge(*BestClusterSucc);

    size_t CurAddr = 0;
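    // Recompute the intra-cluster address of every function in the merged
    // cluster; these addresses feed the short/long call classification.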
    for (auto TargetId : BestClusterPred->targets()) {
@@ -481,6 +622,18 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
      CurAddr += State.Cg->size(TargetId);
    }

    if (opts::UseShortCallCache) {
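      // The cached short-call values for the merged cluster and all of its
      // neighbors are stale now; drop them so they get recomputed on demand.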
      maybeErase(State.ShortCallCache, BestClusterPred);
      Adjacent.forallAdjacent(BestClusterPred,
                              [&State](const Cluster *C) {
                                maybeErase(State.ShortCallCache, C);
                              });
      State.ShortCallPairCache.invalidate(Adjacent, BestClusterPred);
    }
    if (opts::UseGainCache) {
      State.Cache.invalidate(Adjacent, BestClusterPred);
    }

    // remove BestClusterSucc from the list of active clusters
    auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc);
    Clusters.erase(Iter, Clusters.end());
@@ -492,9 +645,8 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged (so their first func is its original func).
  sortByDensity(Clusters);
  std::vector<Cluster> Result;
  for (auto Cluster : Clusters) {
  for (auto Cluster : sortByDensity(Clusters)) {
    Result.emplace_back(std::move(*Cluster));
  }