Run hfsort+ in parallel

Summary:
hfsort+ performs an expensive analysis to determine the
new order of the functions. 99% of the time during hfsort+
is spent in the function runPassTwo. This diff runs the body
of the hot loop in runPassTwo in parallel, speeding up the
total runtime of the reorder-functions pass by up to 4x.

(cherry picked from FBD16450780)
This commit is contained in:
laith sakka 2019-07-23 15:49:02 -07:00 committed by Maksim Panchenko
parent a9b9aa1e02
commit 6443c46b9d
2 changed files with 150 additions and 33 deletions

View File

@ -11,6 +11,7 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "ParallelUtilities.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
@ -319,50 +320,115 @@ public:
/// Merge pairs of clusters while there is an improvement in the
/// expected cache miss ratio
void runPassTwo() {
// BucketsCount is hard-coded to make the algorithm deterministic regardless
// of the number of threads
const unsigned BucketsCount = 124;
unsigned IterationCount = 0;
llvm::ThreadPool *Pool;
if (!opts::NoThreads)
Pool = &ParallelUtilities::getThreadPool();
while (Clusters.size() > 1) {
Cluster *BestClusterPred = nullptr;
Cluster *BestClusterSucc = nullptr;
double BestGain = -1;
for (auto ClusterPred : Clusters) {
// get candidates for merging with the current cluster
MergeCandidateEntry GlobalMaximum;
std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
// Compare two candidates with a given gain
auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
const MergeCandidateEntry &CandidateB) {
// breaking ties by density to make the hottest clusters be
// merged first
return CandidateA.Gain > CandidateB.Gain ||
(std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
compareClusterPairs(
CandidateA.ClusterPred, CandidateA.ClusterSucc,
CandidateB.ClusterPred, CandidateB.ClusterSucc));
};
// find the best candidates to merge within a bucket range
auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
const unsigned BucketId) {
auto &LocalMaximum = LocalMaximums[BucketId];
for (unsigned Idx = Start; Idx < End; Idx++) {
if (Idx >= Clusters.size())
return;
auto ClusterPred = Clusters[Idx];
// get best candidates to merge with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
assert(ClusterPred != ClusterSucc &&
"loop edges are not supported");
// compute the gain of merging two clusters
const double Gain = mergeGain(ClusterPred, ClusterSucc);
// breaking ties by density to make the hottest clusters be merged first
if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
compareClusterPairs(ClusterPred,
ClusterSucc,
BestClusterPred,
BestClusterSucc))) {
BestGain = Gain;
BestClusterPred = ClusterPred;
BestClusterSucc = ClusterSucc;
}
// create a new candidate
MergeCandidateEntry Candidate;
Candidate.Gain = Gain;
Candidate.ClusterPred = ClusterPred;
Candidate.ClusterSucc = ClusterSucc;
if (compareCandidates(Candidate, LocalMaximum))
LocalMaximum = Candidate;
});
}
};
// stop merging when there is no improvement
if (BestGain <= 0.0)
unsigned BucketSize = Clusters.size() / BucketsCount;
if (Clusters.size() % BucketsCount)
BucketSize++;
// find the best candidate within each bucket
unsigned BucketId = 0;
for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
ClusterIdx += BucketSize, BucketId++) {
if (opts::NoThreads) {
findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
} else {
Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
BucketId);
}
}
if (!opts::NoThreads)
Pool->wait();
// find global maximum
for (auto &LocalMaximum : LocalMaximums) {
if (LocalMaximum.Gain > 0 &&
compareCandidates(LocalMaximum, GlobalMaximum))
GlobalMaximum = LocalMaximum;
}
if (GlobalMaximum.Gain <= 0.0)
break;
// merge the best pair of clusters
mergeClusters(BestClusterPred, BestClusterSucc);
DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
<< GlobalMaximum.ClusterSucc->id() << "@@"
<< GlobalMaximum.Gain << "\n");
mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
}
DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
<< " iterations.");
}
/// Run hfsort+ algorithm and return ordered set of function clusters.
std::vector<Cluster> run() {
DEBUG(dbgs() << "Starting hfsort+ w/"
<< (UseGainCache ? "gain cache" : "no cache")
<< " for " << Clusters.size() << " clusters "
<< (UseGainCache ? "gain cache" : "no cache") << " for "
<< Clusters.size() << " clusters "
<< "with ITLBPageSize = " << ITLBPageSize << ", "
<< "ITLBEntries = " << ITLBEntries << ", "
<< "and MergeProbability = " << opts::MergeProbability << "\n");
<< "and MergeProbability = " << opts::MergeProbability
<< "\n");
// Pass 1
runPassOne();
@ -370,7 +436,8 @@ public:
// Pass 2
runPassTwo();
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
<< " clusters\n");
// Sorting clusters by density in decreasing order
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@ -418,6 +485,13 @@ public:
}
private:
/// A candidate merge of two clusters together with the gain of that merge.
/// A default-constructed entry has a gain of -1 and null cluster pointers.
struct MergeCandidateEntry {
  double Gain = -1.0;
  Cluster *ClusterPred = nullptr;
  Cluster *ClusterSucc = nullptr;
};
/// Initialize the set of active clusters, function id to cluster mapping,
/// total number of samples and function addresses.
std::vector<Cluster *> initializeClusters() {
@ -502,7 +576,7 @@ private:
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCache<Cluster, double> GainCache;
mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
};
} // end namespace anonymous

View File

@ -106,6 +106,49 @@ private:
BitVector Valid;
};
// This class holds cached results of specified type for a pair of Clusters,
// stored in a dense Size x Size table indexed by (First->id(), Second->id()).
// It can invalidate all cache entries associated with a given Cluster.
// The functions set, get and contains are thread safe when called with
// distinct keys: every key owns its own element in both underlying vectors.
// invalidate touches the entries of many keys and is not covered by that
// guarantee.
template <typename Cluster, typename ValueType>
class ClusterPairCacheThreadSafe {
public:
  // Creates an empty cache able to hold every ordered pair of clusters whose
  // ids are in the range [0, Size).
  explicit ClusterPairCacheThreadSafe(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, 0) {}

  // Returns true if a value is currently cached for the pair (First, Second).
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)] != 0;
  }

  // Returns the value cached for the pair (First, Second). The pair must be
  // present in the cache.
  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  // Stores Value for the pair (First, Second) and marks the entry as valid.
  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = 1;
  }

  // Drops every cached entry in which C is either the first or the second
  // element of the pair.
  void invalidate(const Cluster *C) {
    const size_t Id = C->id();
    // Pairs (C, *): one contiguous row of the table.
    for (size_t Col = 0; Col < Size; ++Col)
      Valid[(Id * Size) + Col] = 0;
    // Pairs (*, C): one element in every row.
    for (size_t Row = 0; Row < Size; ++Row)
      Valid[(Row * Size) + Id] = 0;
  }

private:
  size_t Size;
  std::vector<ValueType> Cache;
  // One byte-sized flag per pair. Deliberately not std::vector<ValueType>
  // (wastes sizeof(ValueType) per flag and requires ValueType to convert
  // to/from bool) and not std::vector<bool> (packs flags into shared words,
  // so concurrent writes for distinct keys would race).
  std::vector<char> Valid;

  // Maps an ordered pair of clusters to its position in the dense table.
  size_t index(const Cluster *First, const Cluster *Second) const {
    return (First->id() * Size) + Second->id();
  }
};
} // namespace bolt
} // namespace llvm