diff --git a/bolt/CacheMetrics.cpp b/bolt/CacheMetrics.cpp
index bd723b80629d..638872dc67c2 100644
--- a/bolt/CacheMetrics.cpp
+++ b/bolt/CacheMetrics.cpp
@@ -8,26 +8,65 @@
 //===----------------------------------------------------------------------===//
 
 #include "CacheMetrics.h"
+#include "llvm/Support/Options.h"
 
 using namespace llvm;
 using namespace bolt;
 
-using Traversal = std::vector<BinaryBasicBlock *>;
-// The weight of fallthrough jumps for ExtTSP metric
-constexpr double FallthroughWeight = 1.0;
-// The weight of forward jumps for ExtTSP metric
-constexpr double ForwardWeight = 1.0;
-// The weight of backward jumps for ExtTSP metric
-constexpr double BackwardWeight = 1.0;
-// The maximum distance (in bytes) of forward jumps for ExtTSP metric
-constexpr uint64_t ForwardDistance = 256;
-// The maximum distance (in bytes) of backward jumps for ExtTSP metric
-constexpr uint64_t BackwardDistance = 256;
+namespace opts {
 
-// The size of the i-TLB cache page
-constexpr uint64_t ITLBPageSize = 4096;
-// Capacity of the i-TLB cache
-constexpr uint64_t ITLBEntries = 16;
+extern cl::OptionCategory BoltOptCategory;
+
+cl::opt<double>
+FallthroughWeight("fallthrough-weight",
+  cl::desc("The weight of fallthrough jumps for ExtTSP metric"),
+  cl::init(1),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<double>
+ForwardWeight("forward-weight",
+  cl::desc("The weight of forward jumps for ExtTSP metric"),
+  cl::init(0.4),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<double>
+BackwardWeight("backward-weight",
+  cl::desc("The weight of backward jumps for ExtTSP metric"),
+  cl::init(0.4),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ForwardDistance("forward-distance",
+  cl::desc("The maximum distance (in bytes) of forward jumps for ExtTSP metric"),
+  cl::init(768),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+BackwardDistance("backward-distance",
+  cl::desc("The maximum distance (in bytes) of backward jumps for ExtTSP metric"),
+  cl::init(192),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ITLBPageSize("itlb-page-size",
+  cl::desc("The size of i-tlb cache page"),
+  cl::init(4096),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ITLBEntries("itlb-entries",
+  cl::desc("The number of entries in i-tlb cache"),
+  cl::init(16),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+}
 
 namespace {
 
@@ -46,104 +85,6 @@ void extractBasicBlockInfo(
   }
 }
 
-/// Initialize and return a vector of traversals for a given entry block
-std::vector<Traversal> getTraversals(BinaryBasicBlock *EntryBB) {
-  std::vector<Traversal> AllTraversals;
-  std::stack<std::pair<BinaryBasicBlock *, Traversal>> Stack;
-  Stack.push(std::make_pair(EntryBB, Traversal()));
-  std::unordered_set<BinaryBasicBlock *> BBSet;
-
-  while (!Stack.empty()) {
-    BinaryBasicBlock *CurrentBB = Stack.top().first;
-    Traversal PrevTraversal(Stack.top().second);
-    Stack.pop();
-
-    // Add current basic block into consideration
-    BBSet.insert(CurrentBB);
-    PrevTraversal.push_back(CurrentBB);
-
-    if (CurrentBB->succ_empty()) {
-      AllTraversals.push_back(PrevTraversal);
-      continue;
-    }
-
-    bool HaveSuccCount = false;
-    // Calculate total edges count of successors
-    for (auto BI = CurrentBB->branch_info_begin();
-         BI != CurrentBB->branch_info_end(); ++BI) {
-      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
-        HaveSuccCount = true;
-        break;
-      }
-    }
-    if (!HaveSuccCount) {
-      AllTraversals.push_back(PrevTraversal);
-      continue;
-    }
-
-    auto BI = CurrentBB->branch_info_begin();
-    for (auto *SuccBB : CurrentBB->successors()) {
-      // If we have never seen SuccBB, or SuccBB indicates the
-      // end of traversal, SuccBB will be added into stack for
-      // further exploring.
-      if ((BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 &&
-           BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) ||
-          SuccBB->succ_empty()) {
-        Stack.push(std::make_pair(SuccBB, PrevTraversal));
-      }
-      ++BI;
-    }
-  }
-
-  return AllTraversals;
-}
-
-/// Given a traversal, return the sum of block distances along this traversal.
-double getTraversalLength(
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const Traversal &Path) {
-  double Length = 0;
-  for (size_t I = 0; I + 1 < Path.size(); I++) {
-    // Ignore calls between hot and cold parts
-    if (Path[I]->isCold() != Path[I + 1]->isCold())
-      continue;
-    double SrcAddr = BBAddr.at(Path[I]);
-    double DstAddr = BBAddr.at(Path[I + 1]);
-    Length += std::abs(SrcAddr - DstAddr);
-  }
-  return Length;
-}
-
-/// Calculate average number of call distance for every graph traversal
-double calcGraphDistance(
-    const std::vector<BinaryFunction *> &BinaryFunctions,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
-
-  double TotalTraversalLength = 0;
-  double NumTraversals = 0;
-  for (auto BF : BinaryFunctions) {
-    // Only consider functions which are known to be executed
-    if (BF->getKnownExecutionCount() == 0)
-      continue;
-
-    for (auto BB : BF->layout()) {
-      if (BB->isEntryPoint()) {
-        auto AllTraversals = getTraversals(BB);
-        for (auto const &Path : AllTraversals) {
-          // Ignore short traversals
-          if (Path.size() <= 1)
-            continue;
-          TotalTraversalLength += getTraversalLength(BBAddr, Path);
-          NumTraversals++;
-        }
-      }
-    }
-  }
-
-  return TotalTraversalLength / NumTraversals;
-}
-
 /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
 /// the ordering of basic blocks
 double calcTSPScore(
@@ -166,22 +107,12 @@ double calcTSPScore(
   return Score;
 }
 
-/// Calculate Extended-TSP metric, which quantifies the expected number of
-/// i-cache misses for a given ordering of basic blocks. The parameters are:
-/// - FallthroughWeight is the impact of fallthrough jumps on the score
-/// - ForwardWeight is the impact of forward (but not fallthrough) jumps
-/// - BackwardWeight is the impact of backward jumps
-/// - ForwardDistance is the max distance of a forward jump affecting the score
-/// - BackwardDistance is the max distance of a backward jump affecting the score
+/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
+/// misses for a given ordering of basic blocks
 double calcExtTSPScore(
     const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
-    double FallthroughWeight,
-    double ForwardWeight,
-    double BackwardWeight,
-    uint64_t ForwardDistance,
-    uint64_t BackwardDistance) {
+    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
   double Score = 0.0;
 
   for (auto BF : BinaryFunctions) {
@@ -189,33 +120,10 @@ double calcExtTSPScore(
       auto BI = SrcBB->branch_info_begin();
       for (auto DstBB : SrcBB->successors()) {
         if (DstBB != SrcBB) {
-          double Count = BI->Count == BinaryBasicBlock::COUNT_NO_PROFILE
-                           ? 0.0
-                           : double(BI->Count);
-          uint64_t SrcAddr = BBAddr.at(SrcBB);
-          uint64_t SrcSize = BBSize.at(SrcBB);
-          uint64_t DstAddr = BBAddr.at(DstBB);
-
-          if (SrcAddr <= DstAddr) {
-            if (SrcAddr + SrcSize == DstAddr) {
-              // fallthrough jump
-              Score += FallthroughWeight * Count;
-            } else {
-              // the distance of the forward jump
-              size_t Dist = DstAddr - (SrcAddr + SrcSize);
-              if (Dist <= ForwardDistance) {
-                double Prob = double(ForwardDistance - Dist) / ForwardDistance;
-                Score += ForwardWeight * Prob * Count;
-              }
-            }
-          } else {
-            // the distance of the backward jump
-            size_t Dist = SrcAddr + SrcSize - DstAddr;
-            if (Dist <= BackwardDistance) {
-              double Prob = double(BackwardDistance - Dist) / BackwardDistance;
-              Score += BackwardWeight * Prob * Count;
-            }
-          }
+          Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB),
+                                             BBSize.at(SrcBB),
+                                             BBAddr.at(DstBB),
+                                             BI->Count);
         }
         ++BI;
       }
@@ -277,10 +185,10 @@ extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
 double expectedCacheHitRatio(
     const std::vector<BinaryFunction *> &BinaryFunctions,
     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
-    double PageSize,
-    uint64_t CacheEntries) {
+    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
+  const double PageSize = opts::ITLBPageSize;
+  const uint64_t CacheEntries = opts::ITLBEntries;
   auto Calls = extractFunctionCalls(BinaryFunctions);
 
   // Compute 'hotness' of the functions
   double TotalSamples = 0;
@@ -334,6 +242,34 @@ double expectedCacheHitRatio(
   return 100.0 * (1.0 - Misses / TotalSamples);
 }
 
+} // end namespace anonymous
+
+double CacheMetrics::extTSPScore(uint64_t SrcAddr,
+                                 uint64_t SrcSize,
+                                 uint64_t DstAddr,
+                                 uint64_t Count) {
+  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
+
+  // Fallthrough
+  if (SrcAddr + SrcSize == DstAddr) {
+    return opts::FallthroughWeight * Count;
+  }
+  // Forward
+  if (SrcAddr + SrcSize < DstAddr) {
+    const auto Dist = DstAddr - (SrcAddr + SrcSize);
+    if (Dist <= opts::ForwardDistance) {
+      double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
+      return opts::ForwardWeight * Prob * Count;
+    }
+    return 0;
+  }
+  // Backward
+  const auto Dist = SrcAddr + SrcSize - DstAddr;
+  if (Dist <= opts::BackwardDistance) {
+    double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
+    return opts::BackwardWeight * Prob * Count;
+  }
+  return 0;
 }
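For intuition about the formula above, here is a minimal standalone sketch (not part of the patch) that re-states `extTSPScore` with the default option values and evaluates it for three made-up jumps:

```cpp
#include <cstdint>
#include <cstdio>

// Re-statement of extTSPScore with the default option values:
// fallthrough-weight=1, forward-weight=0.4, backward-weight=0.4,
// forward-distance=768, backward-distance=192.
static double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
                          uint64_t DstAddr, uint64_t Count) {
  if (SrcAddr + SrcSize == DstAddr) // fallthrough jump
    return 1.0 * Count;
  if (SrcAddr + SrcSize < DstAddr) { // forward jump
    uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
    return Dist <= 768 ? 0.4 * (1.0 - double(Dist) / 768) * Count : 0;
  }
  uint64_t Dist = SrcAddr + SrcSize - DstAddr; // backward jump
  return Dist <= 192 ? 0.4 * (1.0 - double(Dist) / 192) * Count : 0;
}

int main() {
  printf("%.1f\n", extTSPScore(0, 64, 64, 100));  // fallthrough: 100.0
  printf("%.1f\n", extTSPScore(0, 64, 448, 100)); // forward, Dist=384: 20.0
  printf("%.1f\n", extTSPScore(96, 64, 0, 100));  // backward, Dist=160: 6.7
  return 0;
}
```

Fallthroughs are rewarded at full weight, while the credit for forward and backward jumps decays linearly to zero at their respective distance limits.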
 
 void CacheMetrics::printAll(
@@ -356,10 +292,10 @@ void CacheMetrics::printAll(
   }
 
   outs() << format(" There are %zu functions;", NumFunctions)
-         << format(" %zu (%.2lf%%) have non-empty execution count\n",
+         << format(" %zu (%.2lf%%) have positive execution count\n",
                    NumHotFunctions,
                    100.0 * NumHotFunctions / NumFunctions);
   outs() << format(" There are %zu basic blocks;", NumBlocks)
-         << format(" %zu (%.2lf%%) have non-empty execution count\n",
+         << format(" %zu (%.2lf%%) have positive execution count\n",
                    NumHotBlocks,
                    100.0 * NumHotBlocks / NumBlocks);
 
   std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
@@ -377,35 +313,14 @@ void CacheMetrics::printAll(
   outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu)\n",
                    100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize);
 
-  outs() << " An average length of graph traversal: "
-         << format("%.0lf\n", calcGraphDistance(BinaryFunctions,
-                                                BBAddr,
-                                                BBSize));
-
-  outs() << " Expected i-TLB cache hit ratio "
-         << format("(%zu, %zu): ", ITLBPageSize, ITLBEntries)
+  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BinaryFunctions,
                                                       BBAddr,
-                                                      BBSize,
-                                                      ITLBPageSize,
-                                                      ITLBEntries));
+                                                      BBSize));
 
   outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BinaryFunctions, BBAddr, BBSize));
 
-  outs() << " ExtTSP score "
-         << format("(%.2lf, %.2lf, %.2lf, %zu, %zu): ", FallthroughWeight,
-                                                        ForwardWeight,
-                                                        BackwardWeight,
-                                                        ForwardDistance,
-                                                        BackwardDistance)
-         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions,
-                                              BBAddr,
-                                              BBSize,
-                                              FallthroughWeight,
-                                              ForwardWeight,
-                                              BackwardWeight,
-                                              ForwardDistance,
-                                              BackwardDistance));
-
+  outs() << " ExtTSP score: "
+         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions, BBAddr, BBSize));
 }
diff --git a/bolt/CacheMetrics.h b/bolt/CacheMetrics.h
index 1dab4565bc34..b512168ebaf3 100644
--- a/bolt/CacheMetrics.h
+++ b/bolt/CacheMetrics.h
@@ -20,6 +20,17 @@ namespace CacheMetrics {
 /// Calculate various metrics related to instruction cache performance.
 void printAll(const std::vector<BinaryFunction *> &BinaryFunctions);
 
+/// Calculate Extended-TSP metric, which quantifies the expected number of
+/// i-cache misses for a given pair of basic blocks. The parameters are:
+/// - SrcAddr is the address of the source block;
+/// - SrcSize is the size of the source block;
+/// - DstAddr is the address of the destination block;
+/// - Count is the number of jumps between the pair of blocks.
+double extTSPScore(uint64_t SrcAddr,
+                   uint64_t SrcSize,
+                   uint64_t DstAddr,
+                   uint64_t Count);
+
 } // namespace CacheMetrics
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp
index 6f78bb856f2c..b90cc0f10c52 100644
--- a/bolt/Passes/BinaryPasses.cpp
+++ b/bolt/Passes/BinaryPasses.cpp
@@ -161,6 +161,9 @@ ReorderBlocks("reorder-blocks",
                "cache",
                "perform optimal layout prioritizing I-cache "
                "behavior"),
+    clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS,
+               "cache+",
+               "perform layout optimizing I-cache behavior"),
     clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                "cluster-shuffle",
                "perform random layout of clusters"),
@@ -469,6 +472,10 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
     Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
     break;
 
+  case LT_OPTIMIZE_CACHE_PLUS:
+    Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
+    break;
+
   case LT_OPTIMIZE_SHUFFLE:
     Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
     break;
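Once the `clEnumValN` entry and the dispatch case above are in place, the new mode can be requested like any other block-reordering layout. An illustrative llvm-bolt invocation (the binary and profile names are placeholders):

```
llvm-bolt <input-binary> -o <output-binary> -data=<profile>.fdata -reorder-blocks=cache+
```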
diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h
index ea7376f7997b..0ef8e9027d55 100644
--- a/bolt/Passes/BinaryPasses.h
+++ b/bolt/Passes/BinaryPasses.h
@@ -169,6 +169,8 @@ public:
     /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04)
     /// that suggests putting frequently executed chains first in the layout.
     LT_OPTIMIZE_CACHE,
+    /// Block reordering guided by the extended TSP metric.
+    LT_OPTIMIZE_CACHE_PLUS,
     /// Create clusters and use random order for them.
     LT_OPTIMIZE_SHUFFLE,
   };
diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt
index f9b8db8703af..0255e7b40048 100644
--- a/bolt/Passes/CMakeLists.txt
+++ b/bolt/Passes/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMBOLTPasses
   BinaryFunctionCallGraph.cpp
   CallGraph.cpp
   CallGraphWalker.cpp
+  CachePlusReorderAlgorithm.cpp
   DataflowAnalysis.cpp
   DataflowInfoManager.cpp
   FrameAnalysis.cpp
diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/Passes/CachePlusReorderAlgorithm.cpp
new file mode 100644
index 000000000000..5a717aaec5e2
--- /dev/null
+++ b/bolt/Passes/CachePlusReorderAlgorithm.cpp
@@ -0,0 +1,476 @@
+//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "CacheMetrics.h"
+#include "ReorderAlgorithm.h"
+#include "ReorderUtils.h"
+
+using namespace llvm;
+using namespace bolt;
+using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+// A cluster (ordered sequence) of basic blocks
+class Cluster {
+public:
+  Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_)
+  : Id(BB->getLayoutIndex()),
+    IsEntry(BB->getLayoutIndex() == 0),
+    ExecutionCount(ExecutionCount_),
+    Size(Size_),
+    Score(0) {
+    Blocks.push_back(BB);
+  }
+
+  size_t id() const {
+    return Id;
+  }
+
+  uint64_t size() const {
+    return Size;
+  }
+
+  double density() const {
+    return static_cast<double>(ExecutionCount) / Size;
+  }
+
+  bool isCold() const {
+    return ExecutionCount == 0;
+  }
+
+  uint64_t executionCount() const {
+    return ExecutionCount;
+  }
+
+  bool isEntryPoint() const {
+    return IsEntry;
+  }
+
+  double score() const {
+    return Score;
+  }
+
+  const std::vector<BinaryBasicBlock *> &blocks() const {
+    return Blocks;
+  }
+
+  /// Update the list of basic blocks and meta-info
+  void merge(const Cluster *Other,
+             const std::vector<BinaryBasicBlock *> &MergedBlocks,
+             double MergedScore) {
+    Blocks = MergedBlocks;
+    IsEntry |= Other->IsEntry;
+    ExecutionCount += Other->ExecutionCount;
+    Size += Other->Size;
+    Score = MergedScore;
+  }
+
+private:
+  std::vector<BinaryBasicBlock *> Blocks;
+  size_t Id;
+  bool IsEntry;
+  uint64_t ExecutionCount;
+  uint64_t Size;
+  double Score;
+};
+
+/// Deterministically compare clusters by their density in decreasing order
+bool compareClusters(const Cluster *C1, const Cluster *C2) {
+  // Put the original entry point at the front
+  if (C1->isEntryPoint())
+    return true;
+  if (C2->isEntryPoint())
+    return false;
+
+  const double D1 = C1->density();
+  const double D2 = C2->density();
+  if (D1 != D2)
+    return D1 > D2;
+  // Making the order deterministic
+  return C1->id() < C2->id();
+}
+
+/// Deterministically compare pairs of clusters
+bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
+                         const Cluster *A2, const Cluster *B2) {
+  const auto Samples1 = A1->executionCount() + B1->executionCount();
+  const auto Samples2 = A2->executionCount() + B2->executionCount();
+  if (Samples1 != Samples2)
+    return Samples1 < Samples2;
+
+  if (A1 != A2)
+    return A1->id() < A2->id();
+  return B1->id() < B2->id();
+}
+
+} // end namespace anonymous
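Both comparators above are total orders: every tie eventually falls back to an immutable cluster id, so `std::stable_sort` produces the same layout on every run. A toy illustration of the resulting determinism (hypothetical stand-in type, not part of the patch):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct ToyCluster {
  size_t Id;
  bool Entry;
  double Density;
};

// Mirrors compareClusters: entry first, then density, then id as tie-break.
static bool compare(const ToyCluster *C1, const ToyCluster *C2) {
  if (C1->Entry) return true;
  if (C2->Entry) return false;
  if (C1->Density != C2->Density) return C1->Density > C2->Density;
  return C1->Id < C2->Id;
}

int main() {
  ToyCluster A{0, true, 0.1}, B{1, false, 5.0}, C{2, false, 5.0};
  std::vector<ToyCluster *> Order = {&C, &B, &A};
  std::stable_sort(Order.begin(), Order.end(), compare);
  for (auto *X : Order)
    printf("%zu ", X->Id); // 0 1 2 -- entry first, equal densities by id
  return 0;
}
```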
+
+/// CachePlus - layout of basic blocks with i-cache optimization.
+///
+/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy
+/// heuristic that works with clusters (ordered sequences) of basic blocks.
+/// Initially all clusters are isolated basic blocks. On every iteration,
+/// we pick a pair of clusters whose merging yields the biggest increase in
+/// the ExtTSP metric (see CacheMetrics.cpp for the exact implementation),
+/// which models how i-cache "friendly" a specific cluster is. A pair of
+/// clusters giving the maximum gain is merged into a new cluster. The
+/// procedure stops when there is only one cluster left, or when merging
+/// does not increase ExtTSP. In the latter case, the remaining clusters
+/// are sorted by density.
+///
+/// An important aspect is the way two clusters are merged. Unlike earlier
+/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
+/// clusters, X and Y, are first split into three, X1, X2, and Y. Then we
+/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y,
+/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
+/// This improves the quality of the final result (the search space is larger)
+/// while keeping the implementation sufficiently fast.
+class CachePlus {
+public:
+  CachePlus(const BinaryFunction &BF)
+  : BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) {
+    initialize();
+  }
+
+  /// Run cache+ algorithm and return a basic block ordering
+  std::vector<BinaryBasicBlock *> run() {
+    // Merge pairs of clusters while there is an improvement in ExtTSP metric
+    while (Clusters.size() > 1) {
+      Cluster *BestClusterPred = nullptr;
+      Cluster *BestClusterSucc = nullptr;
+      std::pair<double, size_t> BestGain(-1, 0);
+      for (auto ClusterPred : Clusters) {
+        // Get candidates for merging with the current cluster
+        Adjacent.forAllAdjacent(
+          ClusterPred,
+          // Find the best candidate
+          [&](Cluster *ClusterSucc) {
+            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
+            // Do not merge cold blocks
+            if (ClusterPred->isCold() || ClusterSucc->isCold())
+              return;
+
+            // Compute the gain of merging two clusters
+            auto Gain = mergeGain(ClusterPred, ClusterSucc);
+            if (Gain.first <= 0.0)
+              return;
+
+            // Break ties by density so that the hottest clusters are merged first
+            if (Gain.first > BestGain.first ||
+                (std::abs(Gain.first - BestGain.first) < 1e-8 &&
+                 compareClusterPairs(ClusterPred,
+                                     ClusterSucc,
+                                     BestClusterPred,
+                                     BestClusterSucc))) {
+              BestGain = Gain;
+              BestClusterPred = ClusterPred;
+              BestClusterSucc = ClusterSucc;
+            }
+          });
+      }
+
+      // Stop merging when there is no improvement
+      if (BestGain.first <= 0.0)
+        break;
+
+      // Merge the best pair of clusters
+      mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
+    }
+
+    // Sort clusters by density
+    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
+
+    // Collect the basic blocks in the order specified by their clusters
+    std::vector<BinaryBasicBlock *> Result;
+    Result.reserve(BF.layout_size());
+    for (auto Cluster : Clusters) {
+      Result.insert(Result.end(),
+                    Cluster->blocks().begin(),
+                    Cluster->blocks().end());
+    }
+
+    return Result;
+  }
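A note on the 1e-8 tolerance in the loop above: merge gains are recomputed floating-point sums, so two logically equal gains may differ by rounding noise, and treating near-equal gains as ties hands the decision to the deterministic comparator. A contrived illustration (not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Two merge gains that differ only by rounding noise.
  double G1 = 0.1 + 0.2; // 0.30000000000000004
  double G2 = 0.3;
  printf("equal: %d\n", G1 == G2);                  // 0
  printf("tie:   %d\n", std::fabs(G1 - G2) < 1e-8); // 1
  return 0;
}
```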
+
+private:
+  /// Initialize the set of active clusters, edges between blocks, and the
+  /// adjacency matrix.
+  void initialize() {
+    // Initialize indices of basic blocks
+    size_t LayoutIndex = 0;
+    for (auto BB : BF.layout()) {
+      BB->setLayoutIndex(LayoutIndex);
+      LayoutIndex++;
+    }
+
+    // Initialize edges for the blocks and compute their total in/out weights
+    OutEdges = std::vector<EdgeList>(BF.layout_size());
+    auto InWeight = std::vector<uint64_t>(BF.layout_size(), 0);
+    auto OutWeight = std::vector<uint64_t>(BF.layout_size(), 0);
+    for (auto BB : BF.layout()) {
+      auto BI = BB->branch_info_begin();
+      for (auto I : BB->successors()) {
+        assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+               "missing profile for a jump");
+        if (I != BB && BI->Count > 0) {
+          InWeight[I->getLayoutIndex()] += BI->Count;
+          OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count));
+          OutWeight[BB->getLayoutIndex()] += BI->Count;
+        }
+        ++BI;
+      }
+    }
+
+    // Initialize the execution count of every basic block: the maximum of the
+    // block's profile count and its total in- and out-edge weights. The
+    // execution count of the entry point is set to at least 1.
+    auto ExecutionCounts = std::vector<uint64_t>(BF.layout_size(), 0);
+    for (auto BB : BF.layout()) {
+      uint64_t EC = BB->getKnownExecutionCount();
+      EC = std::max(EC, InWeight[BB->getLayoutIndex()]);
+      EC = std::max(EC, OutWeight[BB->getLayoutIndex()]);
+      if (BB->getLayoutIndex() == 0)
+        EC = std::max(EC, uint64_t(1));
+      ExecutionCounts[BB->getLayoutIndex()] = EC;
+    }
+
+    // Initialize clusters
+    Clusters.reserve(BF.layout_size());
+    AllClusters.reserve(BF.layout_size());
+    Size.reserve(BF.layout_size());
+    for (auto BB : BF.layout()) {
+      size_t Index = BB->getLayoutIndex();
+      Size.push_back(std::max(BB->estimateSize(), size_t(1)));
+      AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
+      Clusters.push_back(&AllClusters[Index]);
+    }
+
+    // Initialize the adjacency matrix
+    Adjacent.initialize(Clusters);
+    for (auto BB : BF.layout()) {
+      for (auto I : BB->successors()) {
+        if (BB != I)
+          Adjacent.set(Clusters[BB->getLayoutIndex()],
+                       Clusters[I->getLayoutIndex()]);
+      }
+    }
+  }
+
+  /// Compute the ExtTSP score for a given order of basic blocks
+  double score(const std::vector<BinaryBasicBlock *> &Blocks) const {
+    uint64_t NotSet = static_cast<uint64_t>(-1);
+    auto Addr = std::vector<uint64_t>(BF.layout_size(), NotSet);
+    uint64_t CurAddr = 0;
+    for (auto BB : Blocks) {
+      size_t Index = BB->getLayoutIndex();
+      Addr[Index] = CurAddr;
+      CurAddr += Size[Index];
+    }
+
+    double Score = 0;
+    for (auto BB : Blocks) {
+      size_t Index = BB->getLayoutIndex();
+      for (auto Edge : OutEdges[Index]) {
+        auto SuccBB = Edge.first;
+        size_t SuccIndex = SuccBB->getLayoutIndex();
+
+        if (Addr[SuccIndex] != NotSet) {
+          Score += CacheMetrics::extTSPScore(Addr[Index],
+                                             Size[Index],
+                                             Addr[SuccIndex],
+                                             Edge.second);
+        }
+      }
+    }
+    return Score;
+  }
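To make the scoring above concrete: score() assigns each block a cumulative byte offset derived from the estimated sizes and then evaluates every recorded out-edge at those offsets. A small sketch with made-up sizes (not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Blocks are laid out back to back at cumulative offsets before scoring.
int main() {
  std::vector<uint64_t> Size = {16, 32, 8};
  std::vector<uint64_t> Addr(Size.size());
  uint64_t Cur = 0;
  for (size_t I = 0; I < Size.size(); I++) {
    Addr[I] = Cur;
    Cur += Size[I];
  }
  // Block 0 jumps to block 2: the source ends at byte 16 and the
  // destination starts at byte 48, so this is a forward jump of 32 bytes.
  uint64_t Dist = Addr[2] - (Addr[0] + Size[0]);
  printf("forward distance = %llu\n", (unsigned long long)Dist); // 32
  return 0;
}
```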
+
+  /// The gain of merging two clusters.
+  ///
+  /// The function considers all possible ways of merging two clusters and
+  /// chooses the one with the largest increase in the ExtTSP metric. The
+  /// result is a pair whose first element is the gain and whose second element
+  /// is the corresponding merge type (encoded as an integer).
+  std::pair<double, size_t> mergeGain(const Cluster *ClusterPred,
+                                      const Cluster *ClusterSucc) const {
+    if (Cache.contains(ClusterPred, ClusterSucc)) {
+      return Cache.get(ClusterPred, ClusterSucc);
+    }
+
+    // The current score of two separate clusters
+    const auto CurScore = ClusterPred->score() + ClusterSucc->score();
+
+    // Merge two clusters and update the best Gain
+    auto computeMergeGain = [&](const std::pair<double, size_t> &CurGain,
+                                const Cluster *ClusterPred,
+                                const Cluster *ClusterSucc,
+                                size_t MergeType) {
+      auto MergedBlocks = mergeBlocks(ClusterPred->blocks(),
+                                      ClusterSucc->blocks(),
+                                      MergeType);
+      // Does the new cluster preserve the original entry point?
+      if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
+          MergedBlocks[0]->getLayoutIndex() != 0)
+        return CurGain;
+
+      // The score of the new cluster
+      const auto NewScore = score(MergedBlocks);
+      if (NewScore > CurScore && NewScore - CurScore > CurGain.first)
+        return std::make_pair(NewScore - CurScore, MergeType);
+      else
+        return CurGain;
+    };
+
+    std::pair<double, size_t> Gain = std::make_pair(-1, 0);
+    // Try to simply concatenate two clusters
+    Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
+    // Try to split ClusterPred into two and merge with ClusterSucc
+    for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
+      for (size_t Type = 0; Type < 4; Type++) {
+        size_t MergeType = 1 + Type + Offset * 4;
+        Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
+      }
+    }
+
+    Cache.set(ClusterPred, ClusterSucc, Gain);
+    return Gain;
+  }
+
+  /// Merge two clusters (orders) of blocks according to a given 'merge type'.
+  ///
+  /// If MergeType == 0, then the result is a concatenation of the two clusters.
+  /// Otherwise, the first cluster is cut into two and we consider all possible
+  /// ways of concatenating the three clusters.
+  std::vector<BinaryBasicBlock *> mergeBlocks(
+    const std::vector<BinaryBasicBlock *> &X,
+    const std::vector<BinaryBasicBlock *> &Y,
+    size_t MergeType
+  ) const {
+    // Concatenate three clusters of blocks in the given order
+    auto concat = [&](const std::vector<BinaryBasicBlock *> &A,
+                      const std::vector<BinaryBasicBlock *> &B,
+                      const std::vector<BinaryBasicBlock *> &C) {
+      std::vector<BinaryBasicBlock *> Result;
+      Result.reserve(A.size() + B.size() + C.size());
+      Result.insert(Result.end(), A.begin(), A.end());
+      Result.insert(Result.end(), B.begin(), B.end());
+      Result.insert(Result.end(), C.begin(), C.end());
+      return Result;
+    };
+
+    // Merging w/o splitting existing clusters
+    if (MergeType == 0) {
+      return concat(X, Y, std::vector<BinaryBasicBlock *>());
+    }
+
+    MergeType--;
+    size_t Type = MergeType % 4;
+    size_t Offset = MergeType / 4;
+    assert(0 < Offset && Offset < X.size() &&
+           "Invalid offset while merging clusters");
+    // Split the first cluster, X, into X1 and X2
+    std::vector<BinaryBasicBlock *> X1(X.begin(), X.begin() + Offset);
+    std::vector<BinaryBasicBlock *> X2(X.begin() + Offset, X.end());
+
+    // Construct a new cluster from the three existing ones
+    switch (Type) {
+    case 0: return concat(X1, Y, X2);
+    case 1: return concat(Y, X2, X1);
+    case 2: return concat(X2, Y, X1);
+    case 3: return concat(X2, X1, Y);
+    default:
+      llvm_unreachable("unexpected merge type");
+    }
+  }
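The MergeType integer used by mergeGain and mergeBlocks packs the split point and the arrangement into a single value: 0 means plain concatenation XY, and any other value decodes as 1 + Type + Offset * 4. A tiny decoder illustrating the encoding (not part of the patch):

```cpp
#include <cstdio>

// Decode the MergeType encoding used by mergeGain/mergeBlocks:
// 0              -> X Y (no split)
// 1 + Type+Off*4 -> split X at Off, then arrange per Type:
//                   0: X1 Y X2, 1: Y X2 X1, 2: X2 Y X1, 3: X2 X1 Y
int main() {
  const char *Arrangement[] = {"X1 Y X2", "Y X2 X1", "X2 Y X1", "X2 X1 Y"};
  for (size_t MergeType : {0u, 5u, 14u}) {
    if (MergeType == 0) {
      printf("%zu -> X Y\n", MergeType);
      continue;
    }
    size_t M = MergeType - 1;
    printf("%zu -> split at %zu, %s\n", MergeType, M / 4, Arrangement[M % 4]);
  }
  return 0;
}
```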
+
+  /// Merge cluster From into cluster Into, and update the list of active
+  /// clusters, the adjacency information, and the corresponding cache.
+  void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
+    assert(Into != From && "Cluster cannot be merged with itself");
+    // Merge the clusters
+    auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
+    Into->merge(From, MergedBlocks, score(MergedBlocks));
+
+    // Remove cluster From from the list of active clusters
+    auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
+    Clusters.erase(Iter, Clusters.end());
+
+    // Invalidate caches
+    Cache.invalidate(Into);
+
+    // Update the adjacency matrix
+    Adjacent.merge(Into, From);
+  }
+
+  // The binary function
+  const BinaryFunction &BF;
+
+  // All clusters
+  std::vector<Cluster> AllClusters;
+
+  // Active clusters. The vector gets updated at runtime when clusters are merged
+  std::vector<Cluster *> Clusters;
+
+  // Size of the block
+  std::vector<uint64_t> Size;
+
+  // Outgoing edges of the block
+  std::vector<EdgeList> OutEdges;
+
+  // Cluster adjacency matrix
+  AdjacencyMatrix<Cluster> Adjacent;
+
+  // A cache that keeps precomputed values of mergeGain for pairs of clusters;
+  // when a pair of clusters (x,y) gets merged, we invalidate the pairs
+  // containing both x and y and all clusters adjacent to x and y (and recompute
+  // them on the next iteration).
+  mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
+};
+
+void CachePlusReorderAlgorithm::reorderBasicBlocks(
+  const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  // Are there jumps with positive execution count?
+  uint64_t SumCount = 0;
+  for (auto BB : BF.layout()) {
+    auto BI = BB->branch_info_begin();
+    for (auto I : BB->successors()) {
+      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr);
+      SumCount += BI->Count;
+      ++BI;
+    }
+  }
+
+  // Do not change the layout of functions w/o profile information
+  if (SumCount == 0) {
+    for (auto BB : BF.layout()) {
+      Order.push_back(BB);
+    }
+    return;
+  }
+
+  // Apply the algorithm
+  Order = CachePlus(BF).run();
+
+  // Verify correctness
+  assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
+  assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout");
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp
index fb8f2cbcf2c2..4d15572110f4 100644
--- a/bolt/Passes/HFSortPlus.cpp
+++ b/bolt/Passes/HFSortPlus.cpp
@@ -29,11 +29,8 @@
 #include "BinaryFunction.h"
 #include "HFSort.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
+#include "ReorderUtils.h"
 #include "llvm/Support/Options.h"
-#include "llvm/Support/raw_ostream.h"
 
 #include <unordered_map>
 #include <vector>
@@ -48,21 +45,9 @@ using namespace bolt;
 namespace opts {
 
 extern cl::OptionCategory BoltOptCategory;
-extern cl::opt<unsigned> Verbosity;
 
-cl::opt<unsigned>
-ITLBPageSizeParam("itlb-page-size",
-  cl::desc("The size of i-tlb cache page"),
-  cl::init(4096),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
-cl::opt<unsigned>
-ITLBEntriesParam("itlb-entries",
-  cl::desc("The number of entries in i-tlb cache"),
-  cl::init(16),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
+extern cl::opt<unsigned> ITLBPageSize;
+extern cl::opt<unsigned> ITLBEntries;
 
 cl::opt<double>
 MergeProbability("merge-probability",
@@ -92,189 +77,46 @@ int32_t ITLBPageSize;
 // while smaller values result in better i-cache performance
 int32_t ITLBEntries;
 
-// This class maintains adjacency information for all Clusters being
-// processed. It is used to invalidate cache entries when merging
-// Clusters and for visiting all neighbors of any given Cluster.
-class AdjacencyMatrix {
- public:
-  AdjacencyMatrix(const CallGraph &Cg,
-                  std::vector<Cluster *> &Clusters,
-                  const std::vector<Cluster *> &FuncCluster)
-    : Clusters(Clusters),
-      Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) {
-    initialize(Cg, FuncCluster);
-  }
-
-  template <typename F>
-  void forallAdjacent(const Cluster *C, F Func) const {
-    const_cast<AdjacencyMatrix *>(this)->forallAdjacent(C, Func);
-  }
-
-  template <typename F>
-  void forallAdjacent(const Cluster *C, F Func) {
-    for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) {
-      Func(Clusters[I]);
-    }
-  }
-
-  // Merge adjacency info from cluster B into cluster A. Info for cluster B is left
-  // in an undefined state.
-  void merge(const Cluster *A, const Cluster *B) {
-    Bits[A->id()] |= Bits[B->id()];
-    Bits[A->id()][A->id()] = false;
-    Bits[A->id()][B->id()] = false;
-    Bits[B->id()][A->id()] = false;
-    for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) {
-      Bits[I][A->id()] = true;
-      Bits[I][B->id()] = false;
-    }
-  }
-
-  void dump(const Cluster *A) const {
-    outs() << "Cluster " << A->id() << ":";
-    forallAdjacent(A, [](const Cluster *B) { outs() << " " << B->id(); });
-  }
-
-  void dump() const {
-    for (auto *A : Clusters) {
-      if (!A) continue;
-      dump(A);
-      outs() << "\n";
-    }
-  }
- private:
-  void set(const Cluster *A, const Cluster *B, bool Value) {
-    assert(A != B);
-    Bits[A->id()][B->id()] = Value;
-    Bits[B->id()][A->id()] = Value;
-  }
-
-  void initialize(const CallGraph &Cg, const std::vector<Cluster *> &FuncCluster) {
-    for (auto *A : Clusters) {
-      for (auto TargetId : A->targets()) {
-        for (auto Succ : Cg.successors(TargetId)) {
-          auto *B = FuncCluster[Succ];
-          if (!B || B == A) continue;
-          const auto &Arc = *Cg.findArc(TargetId, Succ);
-          if (Arc.weight() <= 0.0) continue;
-
-          set(A, B, true);
-        }
-        for (auto Pred : Cg.predecessors(TargetId)) {
-          auto *B = FuncCluster[Pred];
-          if (!B || B == A) continue;
-          const auto &Arc = *Cg.findArc(Pred, TargetId);
-          if (Arc.weight() <= 0.0) continue;
-
-          set(A, B, true);
-        }
-      }
-    }
-  }
-
-  std::vector<Cluster *> Clusters;
-  std::vector<BitVector> Bits;
-};
-
-// A cache of precomputed results for a pair of clusters
-class PrecomputedResults {
- public:
-  explicit PrecomputedResults(size_t Size)
-    : Size(Size),
-      Cache(new double[Size*Size]),
-      Valid(Size * Size, false) {
-    memset(Cache, 0, sizeof(double)*Size*Size);
-  }
-  ~PrecomputedResults() {
-    delete[] Cache;
-  }
-
-  bool contains(const Cluster *First, const Cluster *Second) const {
-    return Valid[index(First, Second)];
-  }
-
-  double get(const Cluster *First, const Cluster *Second) const {
-    assert(contains(First, Second));
-    return Cache[index(First, Second)];
-  }
-
-  void set(const Cluster *First, const Cluster *Second, double Value) {
-    const auto Index = index(First, Second);
-    Cache[Index] = Value;
-    Valid[Index] = true;
-  }
-
-  void invalidate(const Cluster *C) {
-    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
-    for (size_t Id = 0; Id < Size; Id++) {
-      Valid.reset(Id * Size + C->id());
-    }
-  }
-
- private:
-  size_t index(const Cluster *First, const Cluster *Second) const {
-    return First->id() * Size + Second->id();
-  }
-
-  size_t Size;
-  double *Cache;
-  BitVector Valid;
-};
-
-/*
- * Erase an element from a container if it is present. Otherwise, do nothing.
- */
-template <typename C, typename V>
-void maybeErase(C &Container, const V& Value) {
-  auto Itr = Container.find(Value);
-  if (Itr != Container.end())
-    Container.erase(Itr);
-}
-
-/*
- * Density of a cluster formed by merging a given pair of clusters
- */
+/// Density of a cluster formed by merging a given pair of clusters.
 double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) {
   const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
   const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
   return CombinedSamples / CombinedSize;
 }
 
-/*
- * Deterministically compare clusters by their density in decreasing order.
- */
+/// Deterministically compare clusters by density in decreasing order.
 bool compareClusters(const Cluster *C1, const Cluster *C2) {
   const double D1 = C1->density();
   const double D2 = C2->density();
-  if (D1 != D2) return D1 > D2;
+  if (D1 != D2)
+    return D1 > D2;
   // making sure the sorting is deterministic
-  if (C1->size() != C2->size()) return C1->size() < C2->size();
-  if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
+  if (C1->size() != C2->size())
+    return C1->size() < C2->size();
+  if (C1->samples() != C2->samples())
+    return C1->samples() > C2->samples();
   return C1->target(0) < C2->target(0);
 }
 
-/*
- * Deterministically compare pairs of clusters by their density
- * in decreasing order.
- */
+/// Deterministically compare pairs of clusters by density in decreasing order.
 bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                          const Cluster *A2, const Cluster *B2) {
   const auto D1 = density(A1, B1);
   const auto D2 = density(A2, B2);
-  if (D1 != D2) return D1 > D2;
-  // making sure the sorting is deterministic
+  if (D1 != D2)
+    return D1 > D2;
   const auto Size1 = A1->size() + B1->size();
   const auto Size2 = A2->size() + B2->size();
-  if (Size1 != Size2) return Size1 < Size2;
+  if (Size1 != Size2)
+    return Size1 < Size2;
   const auto Samples1 = A1->samples() + B1->samples();
   const auto Samples2 = A2->samples() + B2->samples();
-  if (Samples1 != Samples2) return Samples1 > Samples2;
+  if (Samples1 != Samples2)
+    return Samples1 > Samples2;
   return A1->target(0) < A2->target(0);
 }
 
-/*
- * Sorting clusters by their density in decreasing order
- */
+/// Sorting clusters by their density in decreasing order.
 template <typename C>
 std::vector<Cluster *> sortByDensity(const C &Clusters_) {
   std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
@@ -282,27 +124,23 @@ std::vector<Cluster *> sortByDensity(const C &Clusters_) {
   return Clusters;
 }
 
-/*
- * HFSortPlus - layout of hot functions with iTLB cache optimization
- *
- * Given an ordering of hot functions (and hence, their assignment to the
- * iTLB pages), we can divide all functions calls into two categories:
- * - 'short' ones that have a caller-callee distance less than a page;
- * - 'long' ones where the distance exceeds a page.
- * The short calls are likely to result in a iTLB cache hit. For the long ones,
- * the hit/miss result depends on the 'hotness' of the page (i.e., how often
- * the page is accessed). Assuming that functions are sent to the iTLB cache
- * in a random order, the probability that a page is present in the cache is
- * proportional to the number of samples corresponding to the functions on the
- * page. The following algorithm detects short and long calls, and optimizes
- * the expected number of cache misses for the long ones.
- */
+/// HFSortPlus - layout of hot functions with iTLB cache optimization
+///
+/// Given an ordering of hot functions (and hence, their assignment to the
+/// iTLB pages), we can divide all function calls into two categories:
+/// - 'short' ones that have a caller-callee distance less than a page;
+/// - 'long' ones where the distance exceeds a page.
+/// The short calls are likely to result in an iTLB cache hit. For the long
+/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
+/// often the page is accessed). Assuming that functions are sent to the iTLB
+/// cache in a random order, the probability that a page is present in the
+/// cache is proportional to the number of samples corresponding to the
+/// functions on the page. The following algorithm detects short and long
+/// calls, and optimizes the expected number of cache misses for the long ones.
 class HFSortPlus {
 public:
-  /*
-   * The expected number of calls on different i-TLB pages for an arc of the
-   * call graph with a specified weight
-   */
+  /// The expected number of calls on different i-TLB pages for an arc of the
+  /// call graph with a specified weight
   double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double Weight) const {
     const auto Dist = std::abs(SrcAddr - DstAddr);
     if (Dist > ITLBPageSize)
@@ -313,15 +151,13 @@ public:
     return (1.0 - X * X) * Weight;
   }
 
-  /*
-   * The probability that a page with a given weight is not present in the cache
-   *
-   * Assume that the hot functions are called in a random order; then the
-   * probability of a i-TLB page being accessed after a function call is
-   * p=pageSamples/totalSamples. The probability that the page is not accessed
-   * is (1-p), and the probability that it is not in the cache (i.e. not accessed
-   * during the last ITLBEntries function calls) is (1-p)^ITLBEntries
-   */
+  /// The probability that a page with a given weight is not present in the cache
+  ///
+  /// Assume that the hot functions are called in a random order; then the
+  /// probability of an i-TLB page being accessed after a function call is
+  /// p=pageSamples/totalSamples. The probability that the page is not accessed
+  /// is (1-p), and the probability that it is not in the cache (i.e. not accessed
+  /// during the last ITLBEntries function calls) is (1-p)^ITLBEntries
   double missProbability(double PageSamples) const {
     double P = PageSamples / TotalSamples;
     double X = ITLBEntries;
@@ -330,10 +166,8 @@ public:
     return pow(1.0 - P, X);
   }
 
-  /*
-   * The expected number of calls within a given cluster with both endpoints on
-   * the same cache page
-   */
+  /// The expected number of calls within a given cluster with both endpoints on
+  /// the same cache page
  double shortCalls(const Cluster *Cluster) const {
    double Calls = 0;
    for (auto TargetId : Cluster->targets()) {
@@ -352,10 +186,8 @@ public:
     return Calls;
   }
 
-  /*
-   * The number of calls between the two clusters with both endpoints on
-   * the same i-TLB page, assuming that a given pair of clusters gets merged
-   */
+  /// The number of calls between the two clusters with both endpoints on
+  /// the same i-TLB page, assuming that a given pair of clusters gets merged
   double shortCalls(const Cluster *ClusterPred,
                     const Cluster *ClusterSucc) const {
     double Calls = 0;
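The (1-p)^ITLBEntries model above is easy to sanity-check numerically. A standalone sketch (not part of the patch) with made-up sample counts and the default 16 iTLB entries:

```cpp
#include <cmath>
#include <cstdio>

// missProbability(PageSamples) = (1 - PageSamples/TotalSamples)^ITLBEntries.
// A page holding 10% of all samples is absent from a 16-entry iTLB with
// probability 0.9^16, i.e. it is present about 81.5% of the time.
int main() {
  const double TotalSamples = 1000.0;
  const double ITLBEntries = 16.0;
  for (double PageSamples : {10.0, 100.0, 500.0}) {
    double P = PageSamples / TotalSamples;
    printf("p=%.2f miss=%.3f\n", P, std::pow(1.0 - P, ITLBEntries));
  }
  return 0;
}
```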
@@ -389,18 +221,16 @@ public:
     return Calls;
   }
 
-  /*
-   * The gain of merging two clusters.
-   *
-   * We assume that the final clusters are sorted by their density, and hence
-   * every cluster is likely to be adjacent with clusters of the same density.
-   * Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
-   * which is used to compute the probability of cache misses for long calls
-   * of a given cluster.
-   * The result is also scaled by the size of the resulting cluster in order to
-   * increse the chance of merging short clusters, which is helpful for
-   * the i-cache performance.
-   */
+  /// The gain of merging two clusters.
+  ///
+  /// We assume that the final clusters are sorted by their density, and hence
+  /// every cluster is likely to be adjacent with clusters of the same density.
+  /// Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
+  /// which is used to compute the probability of cache misses for long calls
+  /// of a given cluster.
+  /// The result is also scaled by the size of the resulting cluster in order to
+  /// increase the chance of merging short clusters, which is helpful for
+  /// the i-cache performance.
   double mergeGain(const Cluster *ClusterPred,
                    const Cluster *ClusterSucc) const {
     if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {
@@ -435,9 +265,7 @@ public:
     return Gain;
   }
 
-  /*
-   * For every active cluster, compute its total weight of outgoing edges
-   */
+  /// For every active cluster, compute its total weight of outgoing edges
   std::unordered_map<Cluster *, double> computeOutgoingWeight() {
     std::unordered_map<Cluster *, double> OutWeight;
     for (auto ClusterPred : Clusters) {
@@ -456,9 +284,7 @@ public:
     return OutWeight;
   }
 
-  /*
-   * Find pairs of clusters that call each other with high probability
-   */
+  /// Find pairs of clusters that call each other with high probability
   std::vector<std::pair<Cluster *, Cluster *>> findClustersToMerge() {
     // compute total weight of outgoing edges for every cluster
     auto OutWeight = computeOutgoingWeight();
@@ -503,10 +329,8 @@ public:
     return PairsToMerge;
   }
 
-  /*
-   * Run the first optimization pass of the hfsort+ algorithm:
-   * Merge clusters that call each other with high probability
-   */
+  /// Run the first optimization pass of the hfsort+ algorithm:
+  /// Merge clusters that call each other with high probability
   void runPassOne() {
     while (Clusters.size() > 1) {
       // pairs of clusters that will be merged on this iteration
@@ -523,11 +347,9 @@ public:
     }
   }
 
-  /*
-   * Run the second optimization pass of the hfsort+ algorithm:
-   * Merge pairs of clusters while there is an improvement in the
-   * expected cache miss ratio
-   */
+  /// Run the second optimization pass of the hfsort+ algorithm:
+  /// Merge pairs of clusters while there is an improvement in the
+  /// expected cache miss ratio
   void runPassTwo() {
     while (Clusters.size() > 1) {
       Cluster *BestClusterPred = nullptr;
@@ -535,7 +357,7 @@ public:
       double BestGain = -1;
       for (auto ClusterPred : Clusters) {
         // get candidates for merging with the current cluster
-        Adjacent.forallAdjacent(
+        Adjacent.forAllAdjacent(
           ClusterPred,
           // find the best candidate
           [&](Cluster *ClusterSucc) {
@@ -565,9 +387,7 @@ public:
     }
   }
 
-  /*
-   * Run hfsort+ algorithm and return ordered set of function clusters.
-   */
+  /// Run hfsort+ algorithm and return ordered set of function clusters.
   std::vector<Cluster *> run() {
     DEBUG(dbgs() << "Starting hfsort+ w/"
                  << (UseGainCache ? "gain cache" : "no cache")
@@ -602,19 +422,37 @@
     Addr(Cg.numNodes(), InvalidAddr),
     TotalSamples(0.0),
     Clusters(initializeClusters()),
-    Adjacent(Cg, Clusters, FuncCluster),
+    Adjacent(Cg.numNodes()),
     UseGainCache(UseGainCache),
     GainCache(Clusters.size()) {
+    // Initialize the adjacency matrix
+    Adjacent.initialize(Clusters);
+    for (auto *A : Clusters) {
+      for (auto TargetId : A->targets()) {
+        for (auto Succ : Cg.successors(TargetId)) {
+          auto *B = FuncCluster[Succ];
+          if (!B || B == A) continue;
+          const auto &Arc = *Cg.findArc(TargetId, Succ);
+          if (Arc.weight() > 0.0)
+            Adjacent.set(A, B);
+        }
+        for (auto Pred : Cg.predecessors(TargetId)) {
+          auto *B = FuncCluster[Pred];
+          if (!B || B == A) continue;
+          const auto &Arc = *Cg.findArc(Pred, TargetId);
+          if (Arc.weight() > 0.0)
+            Adjacent.set(A, B);
+        }
+      }
+    }
   }
 
-private:
-  /*
-   * Initialize the set of active clusters, function id to cluster mapping,
-   * total number of samples and function addresses.
-   */
+private:
+  /// Initialize the set of active clusters, function id to cluster mapping,
+  /// total number of samples and function addresses.
   std::vector<Cluster *> initializeClusters() {
-    ITLBPageSize = opts::ITLBPageSizeParam;
-    ITLBEntries = opts::ITLBEntriesParam;
+    ITLBPageSize = opts::ITLBPageSize;
+    ITLBEntries = opts::ITLBEntries;
 
     // Initialize clusters
     std::vector<Cluster *> Clusters;
@@ -632,16 +470,8 @@
     return Clusters;
   }
 
-  /*
-   * Merge cluster From into cluster Into and update the list of active clusters
-   */
+  /// Merge cluster From into cluster Into and update the list of active clusters
   void mergeClusters(Cluster *Into, Cluster *From) {
-    DEBUG(
-      if (opts::Verbosity > 0) {
-        dbgs() << "Merging cluster " << From->id()
-               << " into cluster " << Into->id() << "\n";
-      });
-
     // The adjacency merge must happen before the Cluster::merge since that
     // clobbers the contents of From.
     Adjacent.merge(Into, From);
@@ -690,7 +520,7 @@ private:
   std::vector<Cluster *> Clusters;
 
   // Cluster adjacency matrix
-  AdjacencyMatrix Adjacent;
+  AdjacencyMatrix<Cluster> Adjacent;
 
   // Use cache for mergeGain results
   bool UseGainCache;
@@ -699,10 +529,10 @@ private:
   // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
   // containing both x and y and all clusters adjacent to x and y (and recompute
   // them on the next iteration).
-  mutable PrecomputedResults GainCache;
+  mutable ClusterPairCache<Cluster, double> GainCache;
 };
 
-}
+} // end namespace anonymous
 
 std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache) {
   // It is required that the sum of incoming arc weights is not greater
diff --git a/bolt/Passes/ReorderAlgorithm.h b/bolt/Passes/ReorderAlgorithm.h
index fd50a6c311e5..5be8a93f6f1f 100644
--- a/bolt/Passes/ReorderAlgorithm.h
+++ b/bolt/Passes/ReorderAlgorithm.h
@@ -243,6 +243,16 @@ public:
       const BinaryFunction &BF, BasicBlockOrder &Order) const override;
 };
 
+/// A new reordering algorithm for basic blocks, cache+
+class CachePlusReorderAlgorithm : public ReorderAlgorithm {
+public:
+  explicit CachePlusReorderAlgorithm(
+      std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    ReorderAlgorithm(std::move(CAlgo)) { }
+
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
 
 /// Toy example that simply reverses the original basic block order.
 class ReverseReorderAlgorithm : public ReorderAlgorithm {
diff --git a/bolt/Passes/ReorderUtils.h b/bolt/Passes/ReorderUtils.h
new file mode 100644
index 000000000000..15c68d65e635
--- /dev/null
+++ b/bolt/Passes/ReorderUtils.h
@@ -0,0 +1,112 @@
+//===--- ReorderUtils.h - Helper methods for function and block reordering ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H
+
+#include <memory>
+#include <vector>
+
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+namespace bolt {
+
+// This class maintains adjacency information for all Clusters being
+// processed. It is used for visiting all neighbors of any given Cluster
+// while merging pairs of Clusters. Every Cluster must implement the id() method
+template <typename Cluster> class AdjacencyMatrix {
+public:
+  explicit AdjacencyMatrix(size_t Size) : Bits(Size, BitVector(Size, false)) {}
+
+  void initialize(std::vector<Cluster *> &_Clusters) { Clusters = _Clusters; }
+
+  template <typename F> void forAllAdjacent(const Cluster *C, F Func) const {
+    const_cast<AdjacencyMatrix *>(this)->forAllAdjacent(C, Func);
+  }
+
+  template <typename F> void forAllAdjacent(const Cluster *C, F Func) {
+    for (auto I = Bits[C->id()].find_first(); I != -1;
+         I = Bits[C->id()].find_next(I)) {
+      Func(Clusters[I]);
+    }
+  }
+
+  /// Merge adjacency info from cluster B into cluster A. Info for cluster B is
+  /// left in an undefined state.
+  void merge(const Cluster *A, const Cluster *B) {
+    Bits[A->id()] |= Bits[B->id()];
+    Bits[A->id()][A->id()] = false;
+    Bits[A->id()][B->id()] = false;
+    Bits[B->id()][A->id()] = false;
+    for (auto I = Bits[B->id()].find_first(); I != -1;
+         I = Bits[B->id()].find_next(I)) {
+      Bits[I][A->id()] = true;
+      Bits[I][B->id()] = false;
+    }
+  }
+
+  void set(const Cluster *A, const Cluster *B) { set(A, B, true); }
+
+private:
+  void set(const Cluster *A, const Cluster *B, bool Value) {
+    assert(A != B);
+    Bits[A->id()][B->id()] = Value;
+    Bits[B->id()][A->id()] = Value;
+  }
+
+  std::vector<Cluster *> Clusters;
+  std::vector<BitVector> Bits;
+};
+
+// This class holds cached results of specified type for a pair of Clusters.
+// It can invalidate all cache entries associated with a given Cluster.
+template <typename Cluster, typename ValueType> class ClusterPairCache {
+public:
+  explicit ClusterPairCache(size_t Size)
+      : Size(Size), Cache(Size * Size), Valid(Size * Size, false) {}
+
+  bool contains(const Cluster *First, const Cluster *Second) const {
+    return Valid[index(First, Second)];
+  }
+
+  ValueType get(const Cluster *First, const Cluster *Second) const {
+    assert(contains(First, Second));
+    return Cache[index(First, Second)];
+  }
+
+  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
+    const auto Index = index(First, Second);
+    Cache[Index] = Value;
+    Valid[Index] = true;
+  }
+
+  void invalidate(const Cluster *C) {
+    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
+    for (size_t Id = 0; Id < Size; Id++) {
+      Valid.reset((Id * Size) + C->id());
+    }
+  }
+
+private:
+  size_t index(const Cluster *First, const Cluster *Second) const {
+    return (First->id() * Size) + Second->id();
+  }
+
+  size_t Size;
+  std::vector<ValueType> Cache;
+  BitVector Valid;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
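Both helpers require only that the Cluster type expose an id() method. A minimal usage sketch (toy cluster type; assumes ReorderUtils.h is on the include path; not part of the patch):

```cpp
#include <cstdio>

#include "ReorderUtils.h"

using namespace llvm::bolt;

// Toy cluster exposing the id() method both helpers rely on.
struct ToyCluster {
  size_t Idx;
  size_t id() const { return Idx; }
};

int main() {
  ToyCluster C0{0}, C1{1}, C2{2};
  std::vector<ToyCluster *> Clusters = {&C0, &C1, &C2};

  AdjacencyMatrix<ToyCluster> Adjacent(Clusters.size());
  Adjacent.initialize(Clusters);
  Adjacent.set(&C0, &C1);
  Adjacent.set(&C1, &C2);

  // Cache a value for (C0, C1), then invalidate everything touching C1,
  // as is done after merging a cluster pair.
  ClusterPairCache<ToyCluster, double> Cache(Clusters.size());
  Cache.set(&C0, &C1, 42.0);
  printf("cached: %d\n", Cache.contains(&C0, &C1)); // 1
  Cache.invalidate(&C1);
  printf("cached: %d\n", Cache.contains(&C0, &C1)); // 0

  // After merging C1 into C0, C0 inherits C1's remaining neighbor C2.
  Adjacent.merge(&C0, &C1);
  Adjacent.forAllAdjacent(&C0, [](ToyCluster *C) {
    printf("neighbor: %zu\n", C->id()); // 2
  });
  return 0;
}
```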