Run hfsort+ in parallel

Summary:
hfsort+ performs an expensive analysis to determine the
new order of the functions. 99% of the time in hfsort+
is spent in the function runPassTwo. This diff runs the body
of the hot loop in runPassTwo in parallel, speeding up the
total runtime of the reorder-functions pass by up to 4x.

(cherry picked from FBD16450780)
laith sakka 2019-07-23 15:49:02 -07:00 committed by Maksim Panchenko
parent a9b9aa1e02
commit 6443c46b9d
2 changed files with 150 additions and 33 deletions
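The change is easiest to read as a bucketed max-reduction: the clusters are split into a fixed number of buckets, each bucket finds its best merge candidate independently, and the per-bucket winners are reduced serially. Below is a minimal standalone sketch of that pattern, not the BOLT code itself: it uses plain std::thread instead of the ParallelUtilities thread pool, a hypothetical flattened Score matrix stands in for mergeGain, and the density tie-breaking is omitted for brevity.

#include <algorithm>
#include <thread>
#include <vector>

// Sketch only: same shape as runPassTwo's parallel search, with a
// hypothetical Score matrix instead of mergeGain().
struct Candidate {
  double Gain = -1;
  unsigned Pred = 0;
  unsigned Succ = 0;
};

Candidate findBestPair(const std::vector<double> &Score, unsigned N) {
  // A fixed bucket count keeps the result deterministic no matter how many
  // threads actually run.
  const unsigned BucketsCount = 124;
  const unsigned BucketSize = (N + BucketsCount - 1) / BucketsCount;
  std::vector<Candidate> LocalMax(BucketsCount);
  std::vector<std::thread> Workers;

  for (unsigned B = 0; B * BucketSize < N; ++B) {
    Workers.emplace_back([&, B] {
      const unsigned End = std::min<unsigned>((B + 1) * BucketSize, N);
      for (unsigned Pred = B * BucketSize; Pred < End; ++Pred) {
        for (unsigned Succ = 0; Succ < N; ++Succ) {
          if (Pred == Succ)
            continue;
          const double Gain = Score[Pred * N + Succ]; // stand-in for mergeGain
          // each task writes only its own LocalMax entry, so no locking needed
          if (Gain > LocalMax[B].Gain) {
            LocalMax[B].Gain = Gain;
            LocalMax[B].Pred = Pred;
            LocalMax[B].Succ = Succ;
          }
        }
      }
    });
  }
  for (std::thread &W : Workers)
    W.join();

  // serial reduction over the per-bucket maxima picks the global best pair
  Candidate Global;
  for (const Candidate &LM : LocalMax)
    if (LM.Gain > Global.Gain)
      Global = LM;
  return Global;
}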


@@ -11,6 +11,7 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "ParallelUtilities.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
@@ -319,50 +320,115 @@ public:
/// Merge pairs of clusters while there is an improvement in the
/// expected cache miss ratio
void runPassTwo() {
// BucketsCount is hard-coded to make the algorithm deterministic regardless
// of the number of threads
const unsigned BucketsCount = 124;
unsigned IterationCount = 0;
llvm::ThreadPool *Pool;
if (!opts::NoThreads)
Pool = &ParallelUtilities::getThreadPool();
while (Clusters.size() > 1) {
MergeCandidateEntry GlobalMaximum;
std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
// Compare two candidates with a given gain
auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
const MergeCandidateEntry &CandidateB) {
// breaking ties by density to make the hottest clusters be
// merged first
return CandidateA.Gain > CandidateB.Gain ||
(std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
compareClusterPairs(
CandidateA.ClusterPred, CandidateA.ClusterSucc,
CandidateB.ClusterPred, CandidateB.ClusterSucc));
};
// find the best candidates to merge within a bucket range
auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
const unsigned BucketId) {
auto &LocalMaximum = LocalMaximums[BucketId];
for (unsigned Idx = Start; Idx < End; Idx++) {
if (Idx >= Clusters.size())
return;
auto ClusterPred = Clusters[Idx];
// get best candidates to merge with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc &&
"loop edges are not supported");
// compute the gain of merging two clusters
const double Gain = mergeGain(ClusterPred, ClusterSucc);
// create a new candidate
MergeCandidateEntry Candidate;
Candidate.Gain = Gain;
Candidate.ClusterPred = ClusterPred;
Candidate.ClusterSucc = ClusterSucc;
if (compareCandidates(Candidate, LocalMaximum))
LocalMaximum = Candidate;
});
}
};
unsigned BucketSize = Clusters.size() / BucketsCount;
if (Clusters.size() % BucketsCount)
BucketSize++;
// find the best candidate within each bucket
unsigned BucketId = 0;
for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
ClusterIdx += BucketSize, BucketId++) {
if (opts::NoThreads) {
findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
} else {
Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
BucketId);
}
}
// wait for all the per-bucket tasks to finish
if (!opts::NoThreads)
Pool->wait();
// find global maximum
for (auto &LocalMaximum : LocalMaximums) {
if (LocalMaximum.Gain > 0 &&
compareCandidates(LocalMaximum, GlobalMaximum))
GlobalMaximum = LocalMaximum;
}
// stop merging when there is no improvement
if (GlobalMaximum.Gain <= 0.0)
break;
// merge the best pair of clusters
DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
<< GlobalMaximum.ClusterSucc->id() << "@@"
<< GlobalMaximum.Gain << "\n");
mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
IterationCount++;
}
DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in " << IterationCount
<< " iterations.");
}
/// Run hfsort+ algorithm and return ordered set of function clusters.
std::vector<Cluster> run() {
DEBUG(dbgs() << "Starting hfsort+ w/"
<< (UseGainCache ? "gain cache" : "no cache")
<< " for " << Clusters.size() << " clusters "
<< (UseGainCache ? "gain cache" : "no cache") << " for "
<< Clusters.size() << " clusters "
<< "with ITLBPageSize = " << ITLBPageSize << ", "
<< "ITLBEntries = " << ITLBEntries << ", "
<< "and MergeProbability = " << opts::MergeProbability << "\n");
<< "and MergeProbability = " << opts::MergeProbability
<< "\n");
// Pass 1
runPassOne();
@@ -370,7 +436,8 @@ public:
// Pass 2
runPassTwo();
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
<< " clusters\n");
// Sorting clusters by density in decreasing order
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@@ -418,6 +485,13 @@ public:
}
private:
/// A struct that is used to store a merge candidate
struct MergeCandidateEntry {
double Gain{-1};
Cluster *ClusterPred{nullptr};
Cluster *ClusterSucc{nullptr};
};
/// Initialize the set of active clusters, function id to cluster mapping,
/// total number of samples and function addresses.
std::vector<Cluster *> initializeClusters() {
@@ -502,7 +576,7 @@ private:
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
};
} // end namespace anonymous


@@ -106,6 +106,49 @@ private:
BitVector Valid;
};
// This class holds cached results of specified type for a pair of Clusters.
// It can invalidate all cache entries associated with a given Cluster.
// The functions set, get and contains are thread safe when called with
// distinct keys.
template <typename Cluster, typename ValueType>
class ClusterPairCacheThreadSafe {
public:
explicit ClusterPairCacheThreadSafe(size_t Size)
: Size(Size), Cache(Size * Size), Valid(Size * Size, false) {}
bool contains(const Cluster *First, const Cluster *Second) const {
return Valid[index(First, Second)];
}
ValueType get(const Cluster *First, const Cluster *Second) const {
assert(contains(First, Second));
return Cache[index(First, Second)];
}
void set(const Cluster *First, const Cluster *Second, ValueType Value) {
const auto Index = index(First, Second);
Cache[Index] = Value;
Valid[Index] = true;
}
void invalidate(const Cluster *C) {
for (size_t idx = C->id() * Size; idx < (C->id() + 1) * Size; idx++)
Valid[idx] = false;
for (size_t id = 0; id < Size; id++)
Valid[(id * Size) + C->id()] = false;
}
private:
size_t Size;
std::vector<ValueType> Cache;
std::vector<ValueType> Valid;
size_t index(const Cluster *First, const Cluster *Second) const {
return (First->id() * Size) + Second->id();
}
};
} // namespace bolt
} // namespace llvm
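For reference, a minimal usage sketch of ClusterPairCacheThreadSafe follows. It is not from the BOLT sources; it assumes the class is reachable through ReorderUtils.h (as the include in the first hunk suggests) and uses a hypothetical Cluster type that exposes id(), standing in for BOLT's own cluster type.

#include "ReorderUtils.h"

#include <cstddef>

// Hypothetical cluster type for illustration only.
struct Cluster {
  std::size_t Id;
  std::size_t id() const { return Id; }
};

// Memoize an expensive per-pair computation. contains/get/set are safe to
// call concurrently as long as each thread works on distinct (Pred, Succ)
// keys, which is how runPassTwo partitions the clusters into buckets.
double cachedGain(llvm::bolt::ClusterPairCacheThreadSafe<Cluster, double> &Cache,
                  const Cluster *Pred, const Cluster *Succ,
                  double (*ComputeGain)(const Cluster *, const Cluster *)) {
  if (Cache.contains(Pred, Succ))
    return Cache.get(Pred, Succ);
  const double Gain = ComputeGain(Pred, Succ);
  Cache.set(Pred, Succ, Gain);
  return Gain;
}

// After a pair is merged, invalidate both clusters so stale gains are
// recomputed on the next iteration:
//   Cache.invalidate(Pred);
//   Cache.invalidate(Succ);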