forked from OSchip/llvm-project
Cache+ speed, reduce mallocs
Summary: Speed up cache+ by skipping mallocs on vectors. Although this change speeds up the algorithm by 2x, this is still not enough for some binaries where some functions have ~2500 hot basic blocks. Hence, introduce a threshold for expensive optimizations in CachePlusReorderAlgorithm. If the number of hot basic blocks exceeds the threshold (2048 by default), we use a cheaper version of the algorithm that does not try to split clusters, which is quite fast. (cherry picked from FBD6928075)
This commit is contained in:
parent
5599c01911
commit
e15623058e
|
@ -116,6 +116,8 @@ double calcExtTSPScore(
|
||||||
|
|
||||||
double Score = 0.0;
|
double Score = 0.0;
|
||||||
for (auto BF : BinaryFunctions) {
|
for (auto BF : BinaryFunctions) {
|
||||||
|
if (!BF->hasProfile())
|
||||||
|
continue;
|
||||||
for (auto SrcBB : BF->layout()) {
|
for (auto SrcBB : BF->layout()) {
|
||||||
auto BI = SrcBB->branch_info_begin();
|
auto BI = SrcBB->branch_info_begin();
|
||||||
for (auto DstBB : SrcBB->successors()) {
|
for (auto DstBB : SrcBB->successors()) {
|
||||||
|
|
|
@ -14,11 +14,25 @@
|
||||||
#include "CacheMetrics.h"
|
#include "CacheMetrics.h"
|
||||||
#include "ReorderAlgorithm.h"
|
#include "ReorderAlgorithm.h"
|
||||||
#include "ReorderUtils.h"
|
#include "ReorderUtils.h"
|
||||||
|
#include "llvm/Support/Options.h"
|
||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
using namespace bolt;
|
using namespace bolt;
|
||||||
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
|
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
|
||||||
|
|
||||||
|
namespace opts {
|
||||||
|
|
||||||
|
extern cl::OptionCategory BoltOptCategory;
|
||||||
|
|
||||||
|
cl::opt<unsigned>
|
||||||
|
ClusterSplitThreshold("cluster-split-threshold",
|
||||||
|
cl::desc("The maximum size of a function to apply splitting of clusters"),
|
||||||
|
cl::init(2048),
|
||||||
|
cl::ZeroOrMore,
|
||||||
|
cl::cat(BoltOptCategory));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
namespace bolt {
|
namespace bolt {
|
||||||
|
|
||||||
|
@ -88,6 +102,59 @@ private:
|
||||||
double Score;
|
double Score;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
using ClusterIter = std::vector<BinaryBasicBlock *>::const_iterator;
|
||||||
|
|
||||||
|
// A wrapper around three clusters of basic blocks; it is used to avoid extra
|
||||||
|
// instantiation of the vectors.
|
||||||
|
class MergedCluster {
|
||||||
|
public:
|
||||||
|
MergedCluster(ClusterIter Begin1,
|
||||||
|
ClusterIter End1,
|
||||||
|
ClusterIter Begin2,
|
||||||
|
ClusterIter End2,
|
||||||
|
ClusterIter Begin3,
|
||||||
|
ClusterIter End3)
|
||||||
|
: Begin1(Begin1),
|
||||||
|
End1(End1),
|
||||||
|
Begin2(Begin2),
|
||||||
|
End2(End2),
|
||||||
|
Begin3(Begin3),
|
||||||
|
End3(End3) {}
|
||||||
|
|
||||||
|
template<typename F>
|
||||||
|
void forEach(const F &Func) const {
|
||||||
|
for (auto It = Begin1; It != End1; It++)
|
||||||
|
Func(*It);
|
||||||
|
for (auto It = Begin2; It != End2; It++)
|
||||||
|
Func(*It);
|
||||||
|
for (auto It = Begin3; It != End3; It++)
|
||||||
|
Func(*It);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<BinaryBasicBlock *> getBlocks() const {
|
||||||
|
std::vector<BinaryBasicBlock *> Result;
|
||||||
|
Result.reserve(std::distance(Begin1, End1) +
|
||||||
|
std::distance(Begin2, End2) +
|
||||||
|
std::distance(Begin3, End3));
|
||||||
|
Result.insert(Result.end(), Begin1, End1);
|
||||||
|
Result.insert(Result.end(), Begin2, End2);
|
||||||
|
Result.insert(Result.end(), Begin3, End3);
|
||||||
|
return Result;
|
||||||
|
}
|
||||||
|
|
||||||
|
const BinaryBasicBlock *getFirstBlock() const {
|
||||||
|
return *Begin1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
ClusterIter Begin1;
|
||||||
|
ClusterIter End1;
|
||||||
|
ClusterIter Begin2;
|
||||||
|
ClusterIter End2;
|
||||||
|
ClusterIter Begin3;
|
||||||
|
ClusterIter End3;
|
||||||
|
};
|
||||||
|
|
||||||
/// Deterministically compare clusters by their density in decreasing order
|
/// Deterministically compare clusters by their density in decreasing order
|
||||||
bool compareClusters(const Cluster *C1, const Cluster *C2) {
|
bool compareClusters(const Cluster *C1, const Cluster *C2) {
|
||||||
// original entry point to the front
|
// original entry point to the front
|
||||||
|
@ -140,8 +207,11 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
|
||||||
/// while keeping the implementation sufficiently fast.
|
/// while keeping the implementation sufficiently fast.
|
||||||
class CachePlus {
|
class CachePlus {
|
||||||
public:
|
public:
|
||||||
CachePlus(const BinaryFunction &BF)
|
CachePlus(const BinaryFunction &BF, bool UseClusterSplitting)
|
||||||
: BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) {
|
: BF(BF),
|
||||||
|
UseClusterSplitting(UseClusterSplitting),
|
||||||
|
Adjacent(BF.layout_size()),
|
||||||
|
Cache(BF.layout_size()) {
|
||||||
initialize();
|
initialize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -338,31 +408,37 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compute ExtTSP score for a given order of basic blocks
|
/// Compute ExtTSP score for a given order of basic blocks
|
||||||
double score(const std::vector<BinaryBasicBlock *>& Blocks) const {
|
double score(const MergedCluster& MergedBlocks) const {
|
||||||
uint64_t NotSet = static_cast<uint64_t>(-1);
|
uint64_t NotSet = static_cast<uint64_t>(-1);
|
||||||
auto Addr = std::vector<uint64_t>(BF.layout_size(), NotSet);
|
EstimatedAddr.assign(BF.layout_size(), NotSet);
|
||||||
|
|
||||||
uint64_t CurAddr = 0;
|
uint64_t CurAddr = 0;
|
||||||
for (auto BB : Blocks) {
|
MergedBlocks.forEach(
|
||||||
size_t Index = BB->getLayoutIndex();
|
[&](const BinaryBasicBlock *BB) {
|
||||||
Addr[Index] = CurAddr;
|
size_t Index = BB->getLayoutIndex();
|
||||||
CurAddr += Size[Index];
|
EstimatedAddr[Index] = CurAddr;
|
||||||
}
|
CurAddr += Size[Index];
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
double Score = 0;
|
double Score = 0;
|
||||||
for (auto BB : Blocks) {
|
MergedBlocks.forEach(
|
||||||
size_t Index = BB->getLayoutIndex();
|
[&](const BinaryBasicBlock *BB) {
|
||||||
for (auto Edge : OutEdges[Index]) {
|
size_t Index = BB->getLayoutIndex();
|
||||||
auto SuccBB = Edge.first;
|
for (auto Edge : OutEdges[Index]) {
|
||||||
size_t SuccIndex = SuccBB->getLayoutIndex();
|
auto SuccBB = Edge.first;
|
||||||
|
size_t SuccIndex = SuccBB->getLayoutIndex();
|
||||||
|
|
||||||
if (Addr[SuccBB->getLayoutIndex()] != NotSet) {
|
if (EstimatedAddr[SuccIndex] != NotSet) {
|
||||||
Score += CacheMetrics::extTSPScore(Addr[Index],
|
Score += CacheMetrics::extTSPScore(EstimatedAddr[Index],
|
||||||
Size[Index],
|
Size[Index],
|
||||||
Addr[SuccIndex],
|
EstimatedAddr[SuccIndex],
|
||||||
Edge.second);
|
Edge.second);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
);
|
||||||
|
|
||||||
return Score;
|
return Score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -391,7 +467,7 @@ private:
|
||||||
MergeType);
|
MergeType);
|
||||||
// Does the new cluster preserve the original entry point?
|
// Does the new cluster preserve the original entry point?
|
||||||
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
|
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
|
||||||
MergedBlocks[0]->getLayoutIndex() != 0)
|
MergedBlocks.getFirstBlock()->getLayoutIndex() != 0)
|
||||||
return CurGain;
|
return CurGain;
|
||||||
|
|
||||||
// The score of the new cluster
|
// The score of the new cluster
|
||||||
|
@ -405,18 +481,20 @@ private:
|
||||||
std::pair<double, size_t> Gain = std::make_pair(-1, 0);
|
std::pair<double, size_t> Gain = std::make_pair(-1, 0);
|
||||||
// Try to concatenate two clusters w/o splitting
|
// Try to concatenate two clusters w/o splitting
|
||||||
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
|
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
|
||||||
// Try to split ClusterPred into two and merge with ClusterSucc
|
if (UseClusterSplitting) {
|
||||||
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
|
// Try to split ClusterPred into two and merge with ClusterSucc
|
||||||
// Make sure the splitting does not break FT successors
|
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
|
||||||
auto BB = ClusterPred->blocks()[Offset - 1];
|
// Make sure the splitting does not break FT successors
|
||||||
if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
auto BB = ClusterPred->blocks()[Offset - 1];
|
||||||
assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]);
|
if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
||||||
continue;
|
assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]);
|
||||||
}
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t Type = 0; Type < 4; Type++) {
|
for (size_t Type = 0; Type < 4; Type++) {
|
||||||
size_t MergeType = 1 + Type + Offset * 4;
|
size_t MergeType = 1 + Type + Offset * 4;
|
||||||
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
|
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -426,29 +504,16 @@ private:
|
||||||
|
|
||||||
/// Merge two clusters (orders) of blocks according to a given 'merge type'.
|
/// Merge two clusters (orders) of blocks according to a given 'merge type'.
|
||||||
///
|
///
|
||||||
/// If MergeType == 0, then the results is a concatentation of two clusters.
|
/// If MergeType == 0, then the result is a concatenation of two clusters.
|
||||||
/// Otherwise, the first cluster is cut into two and we consider all possible
|
/// Otherwise, the first cluster is cut into two and we consider all possible
|
||||||
/// ways of concatenating three clusters.
|
/// ways of concatenating three clusters.
|
||||||
std::vector<BinaryBasicBlock *> mergeBlocks(
|
MergedCluster mergeBlocks(const std::vector<BinaryBasicBlock *> &X,
|
||||||
const std::vector<BinaryBasicBlock *> &X,
|
const std::vector<BinaryBasicBlock *> &Y,
|
||||||
const std::vector<BinaryBasicBlock *> &Y,
|
size_t MergeType) const {
|
||||||
size_t MergeType
|
|
||||||
) const {
|
|
||||||
// Concatenate three clusters of blocks in the given order
|
|
||||||
auto concat = [&](const std::vector<BinaryBasicBlock *> &A,
|
|
||||||
const std::vector<BinaryBasicBlock *> &B,
|
|
||||||
const std::vector<BinaryBasicBlock *> &C) {
|
|
||||||
std::vector<BinaryBasicBlock *> Result;
|
|
||||||
Result.reserve(A.size() + B.size() + C.size());
|
|
||||||
Result.insert(Result.end(), A.begin(), A.end());
|
|
||||||
Result.insert(Result.end(), B.begin(), B.end());
|
|
||||||
Result.insert(Result.end(), C.begin(), C.end());
|
|
||||||
return Result;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Merging w/o splitting existing clusters
|
// Merging w/o splitting existing clusters
|
||||||
if (MergeType == 0) {
|
if (MergeType == 0) {
|
||||||
return concat(X, Y, std::vector<BinaryBasicBlock *>());
|
ClusterIter Empty;
|
||||||
|
return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty);
|
||||||
}
|
}
|
||||||
|
|
||||||
MergeType--;
|
MergeType--;
|
||||||
|
@ -457,15 +522,19 @@ private:
|
||||||
assert(0 < Offset && Offset < X.size() &&
|
assert(0 < Offset && Offset < X.size() &&
|
||||||
"Invalid offset while merging clusters");
|
"Invalid offset while merging clusters");
|
||||||
// Split the first cluster, X, into X1 and X2
|
// Split the first cluster, X, into X1 and X2
|
||||||
std::vector<BinaryBasicBlock *> X1(X.begin(), X.begin() + Offset);
|
ClusterIter BeginX1 = X.begin();
|
||||||
std::vector<BinaryBasicBlock *> X2(X.begin() + Offset, X.end());
|
ClusterIter EndX1 = X.begin() + Offset;
|
||||||
|
ClusterIter BeginX2 = X.begin() + Offset;
|
||||||
|
ClusterIter EndX2 = X.end();
|
||||||
|
ClusterIter BeginY = Y.begin();
|
||||||
|
ClusterIter EndY = Y.end();
|
||||||
|
|
||||||
// Construct a new cluster from three existing ones
|
// Construct a new cluster from three existing ones
|
||||||
switch(Type) {
|
switch(Type) {
|
||||||
case 0: return concat(X1, Y, X2);
|
case 0: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
|
||||||
case 1: return concat(Y, X2, X1);
|
case 1: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
|
||||||
case 2: return concat(X2, Y, X1);
|
case 2: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1);
|
||||||
case 3: return concat(X2, X1, Y);
|
case 3: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
|
||||||
default:
|
default:
|
||||||
llvm_unreachable("unexpected merge type");
|
llvm_unreachable("unexpected merge type");
|
||||||
}
|
}
|
||||||
|
@ -479,7 +548,7 @@ private:
|
||||||
|
|
||||||
// Merge the blocks of clusters
|
// Merge the blocks of clusters
|
||||||
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
|
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
|
||||||
Into->merge(From, MergedBlocks, score(MergedBlocks));
|
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
|
||||||
|
|
||||||
// Remove cluster From from the list of active clusters
|
// Remove cluster From from the list of active clusters
|
||||||
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
|
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
|
||||||
|
@ -495,6 +564,9 @@ private:
|
||||||
// The binary function
|
// The binary function
|
||||||
const BinaryFunction &BF;
|
const BinaryFunction &BF;
|
||||||
|
|
||||||
|
// Indicates whether to use cluster splitting for optimization
|
||||||
|
bool UseClusterSplitting;
|
||||||
|
|
||||||
// All clusters
|
// All clusters
|
||||||
std::vector<Cluster> AllClusters;
|
std::vector<Cluster> AllClusters;
|
||||||
|
|
||||||
|
@ -520,6 +592,9 @@ private:
|
||||||
// containing both x and y and all clusters adjacent to x and y (and recompute
|
// containing both x and y and all clusters adjacent to x and y (and recompute
|
||||||
// them on the next iteration).
|
// them on the next iteration).
|
||||||
mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
|
mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
|
||||||
|
|
||||||
|
// A reusable vector used within score() method
|
||||||
|
mutable std::vector<uint64_t> EstimatedAddr;
|
||||||
};
|
};
|
||||||
|
|
||||||
void CachePlusReorderAlgorithm::reorderBasicBlocks(
|
void CachePlusReorderAlgorithm::reorderBasicBlocks(
|
||||||
|
@ -528,18 +603,14 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks(
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// Are there jumps with positive execution count?
|
// Are there jumps with positive execution count?
|
||||||
uint64_t SumCount = 0;
|
size_t NumHotBlocks = 0;
|
||||||
for (auto BB : BF.layout()) {
|
for (auto BB : BF.layout()) {
|
||||||
auto BI = BB->branch_info_begin();
|
if (BB->getKnownExecutionCount() > 0)
|
||||||
for (auto I : BB->successors()) {
|
NumHotBlocks++;
|
||||||
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr);
|
|
||||||
SumCount += BI->Count;
|
|
||||||
++BI;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do not change layout of functions w/o profile information
|
// Do not change layout of functions w/o profile information
|
||||||
if (SumCount == 0) {
|
if (NumHotBlocks == 0) {
|
||||||
for (auto BB : BF.layout()) {
|
for (auto BB : BF.layout()) {
|
||||||
Order.push_back(BB);
|
Order.push_back(BB);
|
||||||
}
|
}
|
||||||
|
@ -547,7 +618,7 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply the algorithm
|
// Apply the algorithm
|
||||||
Order = CachePlus(BF).run();
|
Order = CachePlus(BF, NumHotBlocks <= opts::ClusterSplitThreshold).run();
|
||||||
|
|
||||||
// Verify correctness
|
// Verify correctness
|
||||||
assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
|
assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
|
||||||
|
|
Loading…
Reference in New Issue