Run hfsort+ in parallel

Summary: hfsort+ performs an expensive analysis to determine the new order of the functions. 99% of the time during hfsort+ is spent in the function runPassTwo. This diff runs the body of the hot loop in runPassTwo in parallel, speeding up the total runtime of the reorder-functions pass by up to 4x. (cherry picked from FBD16450780)
2019-07-23 15:49:02 -07:00 · 2019-07-23 15:49:02 -07:00 · 6443c46b9d
parent a9b9aa1e02
commit 6443c46b9d
2 changed files with 150 additions and 33 deletions
--- a/bolt/src/Passes/HFSortPlus.cpp
+++ b/bolt/src/Passes/HFSortPlus.cpp
@ -11,6 +11,7 @@
 #include "BinaryFunction.h"
 #include "HFSort.h"
 #include "ParallelUtilities.h"
 #include "ReorderUtils.h"
 #include "llvm/Support/Options.h"
@ -319,50 +320,115 @@ public:
  /// Merge pairs of clusters while there is an improvement in the
  /// expected cache miss ratio.
  ///
  /// Each iteration of the outer loop splits Clusters into buckets, finds the
  /// best merge candidate of every bucket (inline when opts::NoThreads is set,
  /// otherwise as tasks on the shared thread pool), reduces the per-bucket
  /// results to a single best pair and merges that pair. The loop ends when
  /// at most one cluster is left or when the best gain is not positive.
  void runPassTwo() {
    // BucketsCount is hard-coded to make the algorithm deterministic regardless
    // of the number of threads
    const unsigned BucketsCount = 124;
    // NOTE(review): no increment of IterationCount is visible in this function,
    // so the debug message at the end would always print 0 - confirm
    unsigned IterationCount = 0;
    // Pool is only assigned, and only dereferenced below, when threads are
    // enabled
    llvm::ThreadPool *Pool;
    if (!opts::NoThreads)
      Pool = &ParallelUtilities::getThreadPool();
    while (Clusters.size() > 1) {
-      Cluster *BestClusterPred = nullptr;
+      MergeCandidateEntry GlobalMaximum;
-      Cluster *BestClusterSucc = nullptr;
+      std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
-      double BestGain = -1;
+
-      for (auto ClusterPred : Clusters) {
+      // Compare two candidates with a given gain
-        // get candidates for merging with the current cluster
+      auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
                                  const MergeCandidateEntry &CandidateB) {
        // breaking ties by density to make the hottest clusters be
        // merged first
        return CandidateA.Gain > CandidateB.Gain ||
               (std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
                compareClusterPairs(
                    CandidateA.ClusterPred, CandidateA.ClusterSucc,
                    CandidateB.ClusterPred, CandidateB.ClusterSucc));
      };
      // find the best candidates to merge within a bucket range
      auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
                                    const unsigned BucketId) {
        // every bucket writes only to its own slot of LocalMaximums
        auto &LocalMaximum = LocalMaximums[BucketId];
        for (unsigned Idx = Start; Idx < End; Idx++) {
          // the range of the last bucket may extend past the end of Clusters
          if (Idx >= Clusters.size())
            return;
          auto ClusterPred = Clusters[Idx];
          // get best candidates to merge with the current cluster
          Adjacent.forAllAdjacent(
              ClusterPred,
              // find the best candidate
              [&](Cluster *ClusterSucc) {
-            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
+                assert(ClusterPred != ClusterSucc &&
                       "loop edges are not supported");
                // compute the gain of merging two clusters
                const double Gain = mergeGain(ClusterPred, ClusterSucc);
-            // breaking ties by density to make the hottest clusters be merged first
+                // create a new candidate
-            if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
+                MergeCandidateEntry Candidate;
-                                    compareClusterPairs(ClusterPred,
+                Candidate.Gain = Gain;
-                                                        ClusterSucc,
+                Candidate.ClusterPred = ClusterPred;
-                                                        BestClusterPred,
+                Candidate.ClusterSucc = ClusterSucc;
-                                                        BestClusterSucc))) {
+
-              BestGain = Gain;
+                if (compareCandidates(Candidate, LocalMaximum))
-              BestClusterPred = ClusterPred;
+                  LocalMaximum = Candidate;
              BestClusterSucc = ClusterSucc;
            }
              });
        }
      };
-      // stop merging when there is no improvement
+      unsigned BucketSize = Clusters.size() / BucketsCount;
-      if (BestGain <= 0.0)
+      if (Clusters.size() % BucketsCount)
        BucketSize++;
      // find the best candidate within each bucket, either inline or as a
      // task on the thread pool
      unsigned BucketId = 0;
      for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
           ClusterIdx += BucketSize, BucketId++) {
        if (opts::NoThreads) {
          findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
        } else {
          Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
                      BucketId);
        }
      }
      // all buckets must be finished before their results are read
      if (!opts::NoThreads)
        Pool->wait();
      // find global maximum; slots that still hold the default (negative)
      // gain are skipped
      for (auto &LocalMaximum : LocalMaximums) {
        if (LocalMaximum.Gain > 0 &&
            compareCandidates(LocalMaximum, GlobalMaximum))
          GlobalMaximum = LocalMaximum;
      }
      // stop merging when there is no improvement
      if (GlobalMaximum.Gain <= 0.0)
        break;
-      // merge the best pair of clusters
+      DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
-      mergeClusters(BestClusterPred, BestClusterSucc);
+                   << GlobalMaximum.ClusterSucc->id() << "@@"
                   << GlobalMaximum.Gain << "\n");
      mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
    }
    // NOTE(review): the message below has no space between "in" and the
    // iteration count - confirm whether that is intended
    DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
                 << " iterations.");
  }
  /// Run hfsort+ algorithm and return ordered set of function clusters.
  std::vector<Cluster> run() {
    DEBUG(dbgs() << "Starting hfsort+ w/"
-                 << (UseGainCache ? "gain cache" : "no cache")
+                 << (UseGainCache ? "gain cache" : "no cache") << " for "
-                 << " for " << Clusters.size() << " clusters "
+                 << Clusters.size() << " clusters "
                 << "with ITLBPageSize = " << ITLBPageSize << ", "
                 << "ITLBEntries = " << ITLBEntries << ", "
-                 << "and MergeProbability = " << opts::MergeProbability << "\n");
+                 << "and MergeProbability = " << opts::MergeProbability
                 << "\n");
    // Pass 1
    runPassOne();
@ -370,7 +436,8 @@ public:
    // Pass 2
    runPassTwo();
-    DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
+    DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
                 << " clusters\n");
    // Sorting clusters by density in decreasing order
    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@ -418,6 +485,13 @@ public:
  }
 private:
  /// Record describing one candidate merge: the two clusters involved and
  /// the gain computed for merging them. A default-constructed entry has a
  /// negative gain and no clusters attached.
  struct MergeCandidateEntry {
    double Gain = -1;
    Cluster *ClusterPred = nullptr;
    Cluster *ClusterSucc = nullptr;
  };
  /// Initialize the set of active clusters, function id to cluster mapping,
  /// total number of samples and function addresses.
  std::vector<Cluster *> initializeClusters() {
@ -502,7 +576,7 @@ private:
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y and all clusters adjacent to x and y (and recompute
  // them on the next iteration).
-  mutable ClusterPairCache<Cluster, double> GainCache;
+  mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
 };
 } // end namespace anonymous
--- a/bolt/src/Passes/ReorderUtils.h
+++ b/bolt/src/Passes/ReorderUtils.h
@ -106,6 +106,49 @@ private:
  BitVector Valid;
 };
 // This class holds cached results of specified type for a pair of Clusters.
 // It can invalidate all cache entries associated with a given Cluster.
 // The functions set, get and contains are thread safe when called with
 // distinct keys: every (First, Second) pair maps to its own element of Cache
 // and its own element of Valid. invalidate touches a whole row and a whole
 // column and must not run concurrently with the other functions.
 //
 // Cluster must provide an id() that is smaller than the Size given to the
 // constructor; ids are used directly as row/column indices.
 template <typename Cluster, typename ValueType>
 class ClusterPairCacheThreadSafe {
 public:
  /// Create an empty cache for \p Size clusters (a Size x Size table).
  explicit ClusterPairCacheThreadSafe(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, 0) {}
  /// Return true if a value is currently stored for the ordered pair
  /// (\p First, \p Second).
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)] != 0;
  }
  /// Return the value stored for the ordered pair (\p First, \p Second).
  /// The pair must be present in the cache.
  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }
  /// Store \p Value for the ordered pair (\p First, \p Second).
  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = 1;
  }
  /// Drop every entry in which \p C is either the first or the second
  /// element of the pair.
  void invalidate(const Cluster *C) {
    const size_t Id = C->id();
    // row of C: all pairs (C, *)
    const size_t RowBegin = Id * Size;
    for (size_t Idx = RowBegin; Idx < RowBegin + Size; Idx++)
      Valid[Idx] = 0;
    // column of C: all pairs (*, C)
    for (size_t Row = 0; Row < Size; Row++)
      Valid[(Row * Size) + Id] = 0;
  }
 private:
  size_t Size;
  std::vector<ValueType> Cache;
  // One byte per flag. The flags are independent of ValueType, and
  // std::vector<bool> is deliberately not used: it packs several flags into
  // one memory location, which would make writes to distinct keys race.
  std::vector<char> Valid;
  /// Flattened position of the ordered pair in the Size x Size table.
  size_t index(const Cluster *First, const Cluster *Second) const {
    return (First->id() * Size) + Second->id();
  }
 };
 } // namespace bolt
 } // namespace llvm