@@ -28,18 +28,53 @@
*/

#include "HFSort.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/raw_ostream.h"

#include <vector>
#include <unordered_map>
#include <unordered_set>
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

#undef DEBUG_TYPE
#define DEBUG_TYPE "hfsort"

namespace opts {

extern llvm::cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<bool> Verbosity;

static llvm::cl::opt<bool>
UseGainCache("hfsort+-use-cache",
  llvm::cl::desc("Use a cache for mergeGain results when computing hfsort+."),
  llvm::cl::ZeroOrMore,
  llvm::cl::init(true),
  llvm::cl::Hidden,
  llvm::cl::cat(BoltOptCategory));

static llvm::cl::opt<bool>
UseShortCallCache("hfsort+-use-short-call-cache",
  llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."),
  llvm::cl::ZeroOrMore,
  llvm::cl::init(true),
  llvm::cl::Hidden,
  llvm::cl::cat(BoltOptCategory));

const char* cacheKindString() {
  if (opts::UseGainCache && opts::UseShortCallCache)
    return "gain + short call cache";
  else if (opts::UseGainCache)
    return "gain cache";
  else if (opts::UseShortCallCache)
    return "short call cache";
  else
    return "no cache";
}

}

namespace llvm {
namespace bolt {

@@ -60,66 +95,136 @@ constexpr uint32_t ITLBEntries = 16;

constexpr size_t InvalidAddr = -1;

template <typename A, typename B>
class HashPair {
// This class maintains adjacency information for all Clusters being
// processed. It is used to invalidate cache entries when merging
// Clusters and for visiting all neighbors of any given Cluster.
class AdjacencyMatrix {
public:
  size_t operator()(const std::pair<A, B> &P) const {
    size_t Seed(0);
    Seed = hashCombine(Seed, (int64_t)P.first);
    Seed = hashCombine(Seed, (int64_t)P.second);
    return Seed;
  AdjacencyMatrix(const CallGraph &Cg,
                  std::vector<Cluster *> &Clusters,
                  const std::vector<Cluster *> &FuncCluster)
  : Clusters(Clusters),
    Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) {
    initialize(Cg, FuncCluster);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) const {
    const_cast<AdjacencyMatrix *>(this)->forallAdjacent(C, Func);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) {
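    // Walk the set bits of C's row; each set bit index is the id of a
    // cluster adjacent to C.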
    for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) {
      Func(Clusters[I]);
    }
  }

  void merge(const Cluster *A, const Cluster *B) {
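    // Fold B's adjacency row into A's, clear self-edges and the A<->B edge,
    // then repoint every former neighbor of B at A so the matrix stays
    // symmetric.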
    Bits[A->id()] |= Bits[B->id()];
    Bits[A->id()][A->id()] = false;
    Bits[A->id()][B->id()] = false;
    Bits[B->id()][A->id()] = false;
    for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) {
      Bits[I][A->id()] = true;
      Bits[I][B->id()] = false;
    }
  }

  void dump(const Cluster *A) const {
    outs() << "Cluster " << A->id() << ":";
    forallAdjacent(A,
                   [this,A](const Cluster *B) {
                     outs() << " " << B->id();
                   });
  }

  void dump() const {
    for (auto *A : Clusters) {
      if (!A) continue;
      dump(A);
      outs() << "\n";
    }
  }
private:
  void set(const Cluster *A, const Cluster *B, bool Value) {
    assert(A != B);
    Bits[A->id()][B->id()] = Value;
    Bits[B->id()][A->id()] = Value;
  }

  void initialize(const CallGraph &Cg, const std::vector<Cluster *> &FuncCluster) {
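    // A cluster is adjacent to every cluster that contains a caller or a
    // callee of one of its functions.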
    for (auto *A : Clusters) {
      for (auto TargetId : A->targets()) {
        for (auto Succ : Cg.successors(TargetId)) {
          auto *B = FuncCluster[Succ];
          if (!B || B == A) continue;
          set(A, B, true);
        }
        for (auto Pred : Cg.predecessors(TargetId)) {
          auto *B = FuncCluster[Pred];
          if (!B || B == A) continue;
          set(A, B, true);
        }
      }
    }
  }

  std::vector<Cluster *> Clusters;
  std::vector<BitVector> Bits;
};

// A cache of precomputed results for a pair of clusters
class PrecomputedResults {
public:
  PrecomputedResults() {}

  bool contains(Cluster *First, Cluster *Second) const {
    if (InvalidKeys.count(First) || InvalidKeys.count(Second)) {
      return false;
    }
    const auto Key = std::make_pair(First, Second);
    return Cache.find(Key) != Cache.end();
  explicit PrecomputedResults(size_t Size)
  : Size(Size),
    Cache(new double[Size*Size]),
    Valid(Size * Size, false) {
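    // Zero the dense Size x Size gain matrix up front; an entry only becomes
    // meaningful once its corresponding Valid bit is set.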
    memset(Cache, 0, sizeof(double)*Size*Size);
  }
  ~PrecomputedResults() {
    delete[] Cache;
  }

  double get(Cluster *First, Cluster *Second) const {
  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)];
  }

  double get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    const auto Key = std::make_pair(First, Second); // TODO: use min/max?
    return Cache.find(Key)->second;
    return Cache[index(First, Second)];
  }

  void set(Cluster *First, Cluster *Second, double Value) {
    const auto Key = std::make_pair(First, Second);
    Cache[Key] = Value;
    validate(First);
    validate(Second);
  void set(const Cluster *First, const Cluster *Second, double Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = true;
  }

  void validate(Cluster *C) {
    auto Itr = InvalidKeys.find(C);
    if (Itr != InvalidKeys.end())
      InvalidKeys.erase(Itr);
  void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) {
    invalidate(C);
    Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); });
  }

  void validateAll() {
    InvalidKeys.clear();
  }

  void invalidate(Cluster *Cluster) {
    InvalidKeys.insert(Cluster);
  }

private:
  std::unordered_map<std::pair<Cluster *, Cluster *>,
                     double,
                     HashPair<Cluster *,Cluster *>> Cache;
  std::unordered_set<Cluster *> InvalidKeys;
  void invalidate(const Cluster *C) {
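    // Clear the validity bits of every cached pair whose first element is C.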
    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
  }

  size_t index(const Cluster *First, const Cluster *Second) const {
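    // The cache is a row-major Size x Size matrix indexed by cluster ids, so
    // each ordered pair of clusters maps to a unique slot.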
    return (First->id() * Size) + Second->id();
  }

  size_t Size;
  double *Cache;
  BitVector Valid;
};

// A wrapper for algorthm-wide variables
// A wrapper for algorithm-wide variables
struct AlgoState {
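  // The two pair caches are pre-sized; Size is the number of clusters being
  // processed.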
  explicit AlgoState(size_t Size)
  : Cache(Size), ShortCallPairCache(Size) { }

  // the call graph
  const CallGraph *Cg;
  // the total number of samples in the graph
@@ -130,42 +235,72 @@ struct AlgoState {
  std::vector<size_t> Addr;
  // maximum cluster id.
  size_t MaxClusterId{0};
  // A cache that keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y (and recompute them on the next iteration)
  PrecomputedResults Cache;
  // Cache for shortCalls for a single cluster.
  std::unordered_map<const Cluster *, double> ShortCallCache;
  // Cache for shortCalls for a pair of Clusters
  PrecomputedResults ShortCallPairCache;
};

}

/*
 * Sorting clusters by their density in decreasing order
 */
void sortByDensity(std::vector<Cluster *> &Clusters) {
  std::stable_sort(
    Clusters.begin(),
    Clusters.end(),
    [&] (const Cluster *C1, const Cluster *C2) {
      const double D1 = C1->density();
      const double D2 = C2->density();
      // making sure the sorting is deterministic
      if (D1 != D2) return D1 > D2;
      if (C1->size() != C2->size()) return C1->size() < C2->size();
      if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
      return C1->target(0) < C2->target(0);
    }
  );
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
double density(Cluster *ClusterPred, Cluster *ClusterSucc) {
double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) {
  const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
  const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
  return CombinedSamples / CombinedSize;
}

/*
 * Deterministically compare clusters by their density in decreasing order.
 */
bool compareClusters(const Cluster *C1, const Cluster *C2) {
  const double D1 = C1->density();
  const double D2 = C2->density();
  // making sure the sorting is deterministic
  if (D1 != D2) return D1 > D2;
  if (C1->size() != C2->size()) return C1->size() < C2->size();
  if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
  return C1->target(0) < C2->target(0);
}

/*
 * Deterministically compare pairs of clusters by their density
 * in decreasing order.
 */
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                         const Cluster *A2, const Cluster *B2) {
  const auto D1 = density(A1, B1);
  const auto D2 = density(A2, B2);
  if (D1 != D2) return D1 > D2;
  const auto Size1 = A1->size() + B1->size();
  const auto Size2 = A2->size() + B2->size();
  if (Size1 != Size2) return Size1 < Size2;
  const auto Samples1 = A1->samples() + B1->samples();
  const auto Samples2 = A2->samples() + B2->samples();
  if (Samples1 != Samples2) return Samples1 > Samples2;
  return A1->target(0) < A2->target(0);
}

/*
 * Sorting clusters by their density in decreasing order
 */
template <typename C>
std::vector<Cluster *> sortByDensity(const C &Clusters_) {
  std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
  std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
  return Clusters;
}

/*
 * The probability that a page with a given weight is not present in the cache.
 *
 * Assume that the hot function are called in a random order; then the
 * Assume that the hot functions are called in a random order; then the
 * probability of a TLB page being accessed after a function call is
 * p=pageSamples/totalSamples. The probability that the page is not accessed
 * is (1-p), and the probability that it is not in the cache (i.e. not accessed
@@ -194,11 +329,10 @@ double missProbability(const AlgoState &State, double PageSamples) {
 * page. The following procedure detects short and long calls, and estimates
 * the expected number of cache misses for the long ones.
 */
double expectedCacheHitRatio(const AlgoState &State,
                             const std::vector<Cluster *> &Clusters_) {
  // copy and sort by density
  std::vector<Cluster *> Clusters(Clusters_);
  sortByDensity(Clusters);
template <typename C>
double expectedCacheHitRatio(const AlgoState &State, const C &Clusters_) {
  // sort by density
  std::vector<Cluster *> Clusters(sortByDensity(Clusters_));

  // generate function addresses with an alignment
  std::vector<size_t> Addr(State.Cg->numNodes(), InvalidAddr);
@@ -247,35 +381,6 @@ double expectedCacheHitRatio(const AlgoState &State,
  return 100.0 * (1.0 - Misses / State.TotalSamples);
}

/*
 * Get adjacent clusters (the ones that share an arc) with the given one
 */
std::vector<Cluster *> adjacentClusters(const AlgoState &State, Cluster *C) {
  std::vector<Cluster *> Result;
  Result.reserve(State.MaxClusterId);
  for (auto TargetId : C->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
      auto SuccCluster = State.FuncCluster[Succ];
      if (SuccCluster != nullptr && SuccCluster != C) {
        Result.push_back(SuccCluster);
      }
    }
    for (auto Pred : State.Cg->predecessors(TargetId)) {
      auto PredCluster = State.FuncCluster[Pred];
      if (PredCluster != nullptr && PredCluster != C) {
        Result.push_back(PredCluster);
      }
    }
  }
  std::sort(Result.begin(), Result.end(),
            [](const Cluster *A, const Cluster *B) {
              return A->id() < B->id();
            });
  auto Last = std::unique(Result.begin(), Result.end());
  Result.erase(Last, Result.end());
  return Result;
}

/*
 * The expected number of calls for an edge within the same TLB page
 */
@@ -291,7 +396,13 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) {
 * The expected number of calls within a given cluster with both endpoints on
 * the same TLB cache page
 */
double shortCalls(const AlgoState &State, Cluster *Cluster) {
double shortCalls(AlgoState &State, const Cluster *Cluster) {
  if (opts::UseShortCallCache) {
    auto Itr = State.ShortCallCache.find(Cluster);
    if (Itr != State.ShortCallCache.end())
      return Itr->second;
  }

  double Calls = 0;
  for (auto TargetId : Cluster->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
@@ -306,6 +417,10 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) {
    }
  }

  if (opts::UseShortCallCache) {
    State.ShortCallCache[Cluster] = Calls;
  }

  return Calls;
}

@@ -313,9 +428,14 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) {
 * The number of calls between the two clusters with both endpoints on
 * the same TLB page, assuming that a given pair of clusters gets merged
 */
double shortCalls(const AlgoState &State,
                  Cluster *ClusterPred,
                  Cluster *ClusterSucc) {
double shortCalls(AlgoState &State,
                  const Cluster *ClusterPred,
                  const Cluster *ClusterSucc) {
  if (opts::UseShortCallCache &&
      State.ShortCallPairCache.contains(ClusterPred, ClusterSucc)) {
    return State.ShortCallPairCache.get(ClusterPred, ClusterSucc);
  }

  double Calls = 0;
  for (auto TargetId : ClusterPred->targets()) {
    for (auto Succ : State.Cg->successors(TargetId)) {
@@ -344,6 +464,10 @@ double shortCalls(const AlgoState &State,
    }
  }

  if (opts::UseShortCallCache) {
    State.ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls);
  }

  return Calls;
}

@@ -359,9 +483,13 @@ double shortCalls(const AlgoState &State,
 * increase the chance of merging short clusters, which is helpful for
 * the i-cache performance.
 */
double mergeGain(const AlgoState &State,
                 Cluster *ClusterPred,
                 Cluster *ClusterSucc) {
double mergeGain(AlgoState &State,
                 const Cluster *ClusterPred,
                 const Cluster *ClusterSucc) {
  if (opts::UseGainCache && State.Cache.contains(ClusterPred, ClusterSucc)) {
    return State.Cache.get(ClusterPred, ClusterSucc);
  }

  // cache misses on the first cluster
  double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred);
  double ProbPred = missProbability(State, ClusterPred->density() * PageSize);
@@ -381,7 +509,20 @@ double mergeGain(const AlgoState &State,

  double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew;
  // scaling the result to increase the importance of merging short clusters
  return Gain / (ClusterPred->size() + ClusterSucc->size());
  Gain /= (ClusterPred->size() + ClusterSucc->size());

  if (opts::UseGainCache) {
    State.Cache.set(ClusterPred, ClusterSucc, Gain);
  }

  return Gain;
}

template <typename C, typename V>
void maybeErase(C &Container, const V& Value) {
  auto Itr = Container.find(Value);
  if (Itr != Container.end())
    Container.erase(Itr);
}

/*
@@ -393,37 +534,35 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
  AllClusters.reserve(Cg.numNodes());
  for (NodeId F = 0; F < Cg.numNodes(); F++) {
    AllClusters.emplace_back(F, Cg.getNode(F));
    AllClusters.back().setId(F);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster *> Clusters;
  Clusters.reserve(Cg.numNodes());
  AlgoState State;
  AlgoState State(AllClusters.size()); // TODO: should use final Clusters.size()
  State.Cg = &Cg;
  State.TotalSamples = 0;
  State.FuncCluster = std::vector<Cluster *>(Cg.numNodes(), nullptr);
  State.Addr = std::vector<size_t>(Cg.numNodes(), InvalidAddr);
  if (!AllClusters.empty()) {
    State.MaxClusterId = AllClusters.back().id();
  }
  State.Addr = std::vector<size_t>(Cg.numNodes(), InvalidAddr);
  uint32_t Id = 0;
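  // Only functions with samples participate in reordering; give their
  // clusters consecutive ids so they can index the adjacency matrix and the
  // pair caches.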
  for (NodeId F = 0; F < Cg.numNodes(); F++) {
    if (Cg.samples(F) == 0) continue;
    Clusters.push_back(&AllClusters[F]);
    Clusters.back()->setId(Id);
    State.FuncCluster[F] = &AllClusters[F];
    State.Addr[F] = 0;
    State.TotalSamples += Cg.samples(F);
    ++Id;
  }
  State.MaxClusterId = Id;

  DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n"
  AdjacencyMatrix Adjacent(Cg, Clusters, State.FuncCluster);

  DEBUG(dbgs() << "Starting hfsort+ w/" << opts::cacheKindString() << " for "
               << Clusters.size() << " clusters\n"
               << format("Initial expected iTLB cache hit ratio: %.4lf\n",
                         expectedCacheHitRatio(State, Clusters)));

  // the cache keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y (and recompute them on the next iteration)
  PrecomputedResults Cache;

  int Steps = 0;
  // merge pairs of clusters while there is an improvement
  while (Clusters.size() > 1) {
@@ -435,44 +574,46 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
                         expectedCacheHitRatio(State, Clusters));
      }
    );
    Steps++;
    ++Steps;

    Cluster *BestClusterPred = nullptr;
    Cluster *BestClusterSucc = nullptr;
    double BestGain = -1;
    for (auto ClusterPred : Clusters) {
      // get candidates for merging with the current cluster
      auto CandidateClusters = adjacentClusters(State, ClusterPred);
      Adjacent.forallAdjacent(
        ClusterPred,
        // find the best candidate
        [&](Cluster *ClusterSucc) {
          assert(ClusterPred != ClusterSucc);
          // get a cost of merging two clusters
          const double Gain = mergeGain(State, ClusterPred, ClusterSucc);

      // find the best candidate
      for (auto ClusterSucc : CandidateClusters) {
        // get a cost of merging two clusters
        if (!Cache.contains(ClusterPred, ClusterSucc)) {
          double Value = mergeGain(State, ClusterPred, ClusterSucc);
          Cache.set(ClusterPred, ClusterSucc, Value);
          assert(Cache.contains(ClusterPred, ClusterSucc));
          // breaking ties by density to make the hottest clusters be merged first
          if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
                                  compareClusterPairs(ClusterPred,
                                                      ClusterSucc,
                                                      BestClusterPred,
                                                      BestClusterSucc))) {
            BestGain = Gain;
            BestClusterPred = ClusterPred;
            BestClusterSucc = ClusterSucc;
          }
        }

        double Gain = Cache.get(ClusterPred, ClusterSucc);
        // breaking ties by density to make the hottest clusters be merged first
        if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 &&
                                density(ClusterPred, ClusterSucc) >
                                density(BestClusterPred, BestClusterSucc))) {
          BestGain = Gain;
          BestClusterPred = ClusterPred;
          BestClusterSucc = ClusterSucc;
        }
      }
      );
    }
    Cache.validateAll();

    if (BestGain <= 0.0) break;

    Cache.invalidate(BestClusterPred);
    Cache.invalidate(BestClusterSucc);

    // merge the best pair of clusters
    BestClusterPred->merge(std::move(*BestClusterSucc));
    DEBUG(
      if (opts::Verbosity > 0) {
        dbgs() << "Merging cluster " << BestClusterSucc->id()
               << " into cluster " << BestClusterPred->id() << "\n";
      });

    Adjacent.merge(BestClusterPred, BestClusterSucc);
    BestClusterPred->merge(*BestClusterSucc);

    size_t CurAddr = 0;
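    // Recompute the intra-cluster address of every function in the merged
    // cluster; these addresses feed the short/long call classification.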
    for (auto TargetId : BestClusterPred->targets()) {
@@ -481,6 +622,18 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
      CurAddr += State.Cg->size(TargetId);
    }

    if (opts::UseShortCallCache) {
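      // The cached short-call values for the merged cluster and all of its
      // neighbors are stale now; drop them so they get recomputed on demand.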
      maybeErase(State.ShortCallCache, BestClusterPred);
      Adjacent.forallAdjacent(BestClusterPred,
                              [&State](const Cluster *C) {
                                maybeErase(State.ShortCallCache, C);
                              });
      State.ShortCallPairCache.invalidate(Adjacent, BestClusterPred);
    }
    if (opts::UseGainCache) {
      State.Cache.invalidate(Adjacent, BestClusterPred);
    }

    // remove BestClusterSucc from the list of active clusters
    auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc);
    Clusters.erase(Iter, Clusters.end());
@@ -492,9 +645,8 @@ std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged (so their first func is its original func).
  sortByDensity(Clusters);
  std::vector<Cluster> Result;
  for (auto Cluster : Clusters) {
  for (auto Cluster : sortByDensity(Clusters)) {
    Result.emplace_back(std::move(*Cluster));
  }