forked from OSchip/llvm-project
Run hfsort+ in parallel
Summary: hfsort+ performs an expensive analysis to determine the new order of the functions. 99% of the hfsort+ run time is spent in the function runPassTwo. This diff runs the body of the hot loop in runPassTwo in parallel, speeding up the total runtime of the reorder-functions pass by up to 4x. (cherry picked from FBD16450780)
This commit is contained in:
parent
a9b9aa1e02
commit
6443c46b9d
|
@ -11,6 +11,7 @@
|
||||||
|
|
||||||
#include "BinaryFunction.h"
|
#include "BinaryFunction.h"
|
||||||
#include "HFSort.h"
|
#include "HFSort.h"
|
||||||
|
#include "ParallelUtilities.h"
|
||||||
#include "ReorderUtils.h"
|
#include "ReorderUtils.h"
|
||||||
#include "llvm/Support/Options.h"
|
#include "llvm/Support/Options.h"
|
||||||
|
|
||||||
|
@ -319,50 +320,115 @@ public:
|
||||||
/// Merge pairs of clusters while there is an improvement in the
|
/// Merge pairs of clusters while there is an improvement in the
|
||||||
/// expected cache miss ratio
|
/// expected cache miss ratio
|
||||||
void runPassTwo() {
|
void runPassTwo() {
|
||||||
|
// BucketsCount is hard-coded to make the algorithm deterministic regardless
|
||||||
|
// of the number of threads
|
||||||
|
const unsigned BucketsCount = 124;
|
||||||
|
unsigned IterationCount = 0;
|
||||||
|
|
||||||
|
llvm::ThreadPool *Pool;
|
||||||
|
if (!opts::NoThreads)
|
||||||
|
Pool = &ParallelUtilities::getThreadPool();
|
||||||
|
|
||||||
while (Clusters.size() > 1) {
|
while (Clusters.size() > 1) {
|
||||||
Cluster *BestClusterPred = nullptr;
|
MergeCandidateEntry GlobalMaximum;
|
||||||
Cluster *BestClusterSucc = nullptr;
|
std::vector<MergeCandidateEntry> LocalMaximums(BucketsCount);
|
||||||
double BestGain = -1;
|
|
||||||
for (auto ClusterPred : Clusters) {
|
// Compare two candidates with a given gain
|
||||||
// get candidates for merging with the current cluster
|
auto compareCandidates = [](const MergeCandidateEntry &CandidateA,
|
||||||
|
const MergeCandidateEntry &CandidateB) {
|
||||||
|
// breaking ties by density to make the hottest clusters be
|
||||||
|
// merged first
|
||||||
|
return CandidateA.Gain > CandidateB.Gain ||
|
||||||
|
(std::abs(CandidateA.Gain - CandidateB.Gain) < 1e-8 &&
|
||||||
|
compareClusterPairs(
|
||||||
|
CandidateA.ClusterPred, CandidateA.ClusterSucc,
|
||||||
|
CandidateB.ClusterPred, CandidateB.ClusterSucc));
|
||||||
|
};
|
||||||
|
|
||||||
|
// find the best candidates to merge within a bucket range
|
||||||
|
auto findMaximaInBucket = [&](const unsigned Start, const unsigned End,
|
||||||
|
const unsigned BucketId) {
|
||||||
|
auto &LocalMaximum = LocalMaximums[BucketId];
|
||||||
|
|
||||||
|
for (unsigned Idx = Start; Idx < End; Idx++) {
|
||||||
|
if (Idx >= Clusters.size())
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto ClusterPred = Clusters[Idx];
|
||||||
|
|
||||||
|
// get best candidates to merge with the current cluster
|
||||||
Adjacent.forAllAdjacent(
|
Adjacent.forAllAdjacent(
|
||||||
ClusterPred,
|
ClusterPred,
|
||||||
// find the best candidate
|
// find the best candidate
|
||||||
[&](Cluster *ClusterSucc) {
|
[&](Cluster *ClusterSucc) {
|
||||||
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
assert(ClusterPred != ClusterSucc &&
|
||||||
|
"loop edges are not supported");
|
||||||
|
|
||||||
// compute the gain of merging two clusters
|
// compute the gain of merging two clusters
|
||||||
const double Gain = mergeGain(ClusterPred, ClusterSucc);
|
const double Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||||
|
|
||||||
// breaking ties by density to make the hottest clusters be merged first
|
// create a new candidate
|
||||||
if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
|
MergeCandidateEntry Candidate;
|
||||||
compareClusterPairs(ClusterPred,
|
Candidate.Gain = Gain;
|
||||||
ClusterSucc,
|
Candidate.ClusterPred = ClusterPred;
|
||||||
BestClusterPred,
|
Candidate.ClusterSucc = ClusterSucc;
|
||||||
BestClusterSucc))) {
|
|
||||||
BestGain = Gain;
|
if (compareCandidates(Candidate, LocalMaximum))
|
||||||
BestClusterPred = ClusterPred;
|
LocalMaximum = Candidate;
|
||||||
BestClusterSucc = ClusterSucc;
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// stop merging when there is no improvement
|
unsigned BucketSize = Clusters.size() / BucketsCount;
|
||||||
if (BestGain <= 0.0)
|
if (Clusters.size() % BucketsCount)
|
||||||
|
BucketSize++;
|
||||||
|
|
||||||
|
// find the best candidate within each bucket
|
||||||
|
unsigned BucketId = 0;
|
||||||
|
for (unsigned ClusterIdx = 0; ClusterIdx < Clusters.size();
|
||||||
|
ClusterIdx += BucketSize, BucketId++) {
|
||||||
|
|
||||||
|
if (opts::NoThreads) {
|
||||||
|
findMaximaInBucket(ClusterIdx, ClusterIdx + BucketSize, BucketId);
|
||||||
|
} else {
|
||||||
|
Pool->async(findMaximaInBucket, ClusterIdx, ClusterIdx + BucketSize,
|
||||||
|
BucketId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!opts::NoThreads)
|
||||||
|
Pool->wait();
|
||||||
|
|
||||||
|
// find global maximum
|
||||||
|
for (auto &LocalMaximum : LocalMaximums) {
|
||||||
|
if (LocalMaximum.Gain > 0 &&
|
||||||
|
compareCandidates(LocalMaximum, GlobalMaximum))
|
||||||
|
GlobalMaximum = LocalMaximum;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (GlobalMaximum.Gain <= 0.0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// merge the best pair of clusters
|
DEBUG(outs() << "merging##" << GlobalMaximum.ClusterPred->id() << "##"
|
||||||
mergeClusters(BestClusterPred, BestClusterSucc);
|
<< GlobalMaximum.ClusterSucc->id() << "@@"
|
||||||
|
<< GlobalMaximum.Gain << "\n");
|
||||||
|
|
||||||
|
mergeClusters(GlobalMaximum.ClusterPred, GlobalMaximum.ClusterSucc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEBUG(outs() << "BOLT-INFO: hfsort+ pass two finished in" << IterationCount
|
||||||
|
<< " iterations.");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run hfsort+ algorithm and return ordered set of function clusters.
|
/// Run hfsort+ algorithm and return ordered set of function clusters.
|
||||||
std::vector<Cluster> run() {
|
std::vector<Cluster> run() {
|
||||||
DEBUG(dbgs() << "Starting hfsort+ w/"
|
DEBUG(dbgs() << "Starting hfsort+ w/"
|
||||||
<< (UseGainCache ? "gain cache" : "no cache")
|
<< (UseGainCache ? "gain cache" : "no cache") << " for "
|
||||||
<< " for " << Clusters.size() << " clusters "
|
<< Clusters.size() << " clusters "
|
||||||
<< "with ITLBPageSize = " << ITLBPageSize << ", "
|
<< "with ITLBPageSize = " << ITLBPageSize << ", "
|
||||||
<< "ITLBEntries = " << ITLBEntries << ", "
|
<< "ITLBEntries = " << ITLBEntries << ", "
|
||||||
<< "and MergeProbability = " << opts::MergeProbability << "\n");
|
<< "and MergeProbability = " << opts::MergeProbability
|
||||||
|
<< "\n");
|
||||||
|
|
||||||
// Pass 1
|
// Pass 1
|
||||||
runPassOne();
|
runPassOne();
|
||||||
|
@ -370,7 +436,8 @@ public:
|
||||||
// Pass 2
|
// Pass 2
|
||||||
runPassTwo();
|
runPassTwo();
|
||||||
|
|
||||||
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
|
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size()
|
||||||
|
<< " clusters\n");
|
||||||
|
|
||||||
// Sorting clusters by density in decreasing order
|
// Sorting clusters by density in decreasing order
|
||||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||||
|
@ -418,6 +485,13 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/// A struct that is used to store a merge candidate
|
||||||
|
struct MergeCandidateEntry {
|
||||||
|
double Gain{-1};
|
||||||
|
Cluster *ClusterPred{nullptr};
|
||||||
|
Cluster *ClusterSucc{nullptr};
|
||||||
|
};
|
||||||
|
|
||||||
/// Initialize the set of active clusters, function id to cluster mapping,
|
/// Initialize the set of active clusters, function id to cluster mapping,
|
||||||
/// total number of samples and function addresses.
|
/// total number of samples and function addresses.
|
||||||
std::vector<Cluster *> initializeClusters() {
|
std::vector<Cluster *> initializeClusters() {
|
||||||
|
@ -502,7 +576,7 @@ private:
|
||||||
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
|
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
|
||||||
// containing both x and y and all clusters adjacent to x and y (and recompute
|
// containing both x and y and all clusters adjacent to x and y (and recompute
|
||||||
// them on the next iteration).
|
// them on the next iteration).
|
||||||
mutable ClusterPairCache<Cluster, double> GainCache;
|
mutable ClusterPairCacheThreadSafe<Cluster, double> GainCache;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // end namespace anonymous
|
} // end namespace anonymous
|
||||||
|
|
|
@ -106,6 +106,49 @@ private:
|
||||||
BitVector Valid;
|
BitVector Valid;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// This class holds cached results of the specified type for an ordered pair
// of Clusters. It can invalidate all cache entries associated with a given
// Cluster.
//
// Entries are addressed through Cluster::id(); every id must be smaller than
// the Size passed to the constructor. The functions set, get and contains
// touch only the slot of their own (First, Second) pair, so they are thread
// safe when called with distinct keys. invalidate rewrites a whole row and
// column and is not covered by that guarantee.
template <typename Cluster, typename ValueType>
class ClusterPairCacheThreadSafe {
public:
  // Creates an empty cache with room for Size * Size pairs.
  explicit ClusterPairCacheThreadSafe(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, 0) {}

  // Returns true if a value is currently stored for (First, Second).
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)] != 0;
  }

  // Returns the value stored for (First, Second); the pair must be cached.
  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  // Stores Value for (First, Second) and marks the pair as cached.
  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = 1;
  }

  // Drops every cached pair in which C is either the first or the second
  // cluster.
  void invalidate(const Cluster *C) {
    const size_t Id = C->id();

    // Row of C: pairs (C, *).
    for (size_t Idx = Id * Size; Idx < (Id + 1) * Size; Idx++)
      Valid[Idx] = 0;

    // Column of C: pairs (*, C).
    for (size_t Row = 0; Row < Size; Row++)
      Valid[(Row * Size) + Id] = 0;
  }

private:
  size_t Size;
  std::vector<ValueType> Cache;
  // One byte per pair, independent of ValueType. Deliberately not
  // std::vector<bool>: its packed bits would make neighbouring pairs share
  // storage, so writes to distinct keys could race.
  std::vector<char> Valid;

  // Maps the ordered pair (First, Second) to its slot in the flat arrays.
  size_t index(const Cluster *First, const Cluster *Second) const {
    return (First->id() * Size) + Second->id();
  }
};
|
||||||
|
|
||||||
} // namespace bolt
|
} // namespace bolt
|
||||||
} // namespace llvm
|
} // namespace llvm
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue