[BOLT] A new block reordering algorithm

Summary: A new block reordering algorithm, cache+, that is designed to optimize i-cache performance. At a high level, this algorithm is a greedy heuristic that merges clusters (ordered sequences) of basic blocks, similarly to how it is done in OptimizeCacheReorderAlgorithm. There are two important differences: (a) the metric that is optimized in the procedure, and (b) how two clusters are merged together.

Initially all clusters are isolated basic blocks. On every iteration, we pick a pair of clusters whose merging yields the biggest increase in the ExtTSP metric (see CacheMetrics.cpp for the exact implementation), which models how i-cache "friendly" a specific cluster is. The pair of clusters giving the maximum gain is merged into a new cluster. The procedure stops when there is only one cluster left, or when merging does not increase ExtTSP. In the latter case, the remaining clusters are sorted by density.

An important aspect is the way two clusters are merged. Unlike earlier algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two clusters, X and Y, are first split into three, X1, X2, and Y. Then we consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. This improves the quality of the final result (the search space is larger) while keeping the implementation sufficiently fast.

(cherry picked from FBD6466264)
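
For illustration, here is a minimal sketch of the split-and-merge step described above (an editor's addition, not part of the patch; `score` is a toy stand-in for the ExtTSP metric computed in CacheMetrics.cpp):

// Editor's sketch: enumerate the six ways of gluing X1, X2, and Y and keep
// the best-scoring one. Block ids stand in for BinaryBasicBlock pointers.
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

using Blocks = std::vector<int>;

// Toy stand-in for the ExtTSP score: counts consecutive block ids, which
// rewards orderings with many fallthroughs.
double score(const Blocks &Order) {
  double S = 0;
  for (std::size_t I = 0; I + 1 < Order.size(); ++I)
    S += (Order[I] + 1 == Order[I + 1]) ? 1.0 : 0.0;
  return S;
}

Blocks bestMerge(const Blocks &X, const Blocks &Y, std::size_t Offset) {
  // Split X into X1 = X[0..Offset) and X2 = X[Offset..end)
  const Blocks X1(X.begin(), X.begin() + Offset);
  const Blocks X2(X.begin() + Offset, X.end());
  auto concat = [](const Blocks &A, const Blocks &B, const Blocks &C) {
    Blocks R;
    R.reserve(A.size() + B.size() + C.size());
    R.insert(R.end(), A.begin(), A.end());
    R.insert(R.end(), B.begin(), B.end());
    R.insert(R.end(), C.begin(), C.end());
    return R;
  };
  // The six gluings from the summary: X1YX2, X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1
  const Blocks Candidates[] = {
      concat(X1, Y, X2), concat(X1, X2, Y), concat(X2, X1, Y),
      concat(X2, Y, X1), concat(Y, X1, X2), concat(Y, X2, X1)};
  return *std::max_element(
      std::begin(Candidates), std::end(Candidates),
      [](const Blocks &A, const Blocks &B) { return score(A) < score(B); });
}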
Parent: 1fa80594cf · Commit: a599fe1bbc
@@ -8,26 +8,65 @@
//===----------------------------------------------------------------------===//

#include "CacheMetrics.h"
#include "llvm/Support/Options.h"

using namespace llvm;
using namespace bolt;
using Traversal = std::vector<BinaryBasicBlock *>;

// The weight of fallthrough jumps for ExtTSP metric
constexpr double FallthroughWeight = 1.0;
// The weight of forward jumps for ExtTSP metric
constexpr double ForwardWeight = 1.0;
// The weight of backward jumps for ExtTSP metric
constexpr double BackwardWeight = 1.0;
// The maximum distance (in bytes) of forward jumps for ExtTSP metric
constexpr uint64_t ForwardDistance = 256;
// The maximum distance (in bytes) of backward jumps for ExtTSP metric
constexpr uint64_t BackwardDistance = 256;
namespace opts {

// The size of the i-TLB cache page
constexpr uint64_t ITLBPageSize = 4096;
// Capacity of the i-TLB cache
constexpr uint64_t ITLBEntries = 16;
extern cl::OptionCategory BoltOptCategory;

cl::opt<double>
FallthroughWeight("fallthrough-weight",
  cl::desc("The weight of fallthrough jumps for ExtTSP metric"),
  cl::init(1),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<double>
ForwardWeight("forward-weight",
  cl::desc("The weight of forward jumps for ExtTSP metric"),
  cl::init(0.4),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<double>
BackwardWeight("backward-weight",
  cl::desc("The weight of backward jumps for ExtTSP metric"),
  cl::init(0.4),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<unsigned>
ForwardDistance("forward-distance",
  cl::desc("The maximum distance (in bytes) of forward jumps for ExtTSP metric"),
  cl::init(768),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<unsigned>
BackwardDistance("backward-distance",
  cl::desc("The maximum distance (in bytes) of backward jumps for ExtTSP metric"),
  cl::init(192),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<unsigned>
ITLBPageSize("itlb-page-size",
  cl::desc("The size of i-tlb cache page"),
  cl::init(4096),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<unsigned>
ITLBEntries("itlb-entries",
  cl::desc("The number of entries in i-tlb cache"),
  cl::init(16),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

}

namespace {

@@ -46,104 +85,6 @@ void extractBasicBlockInfo(
  }
}

/// Initialize and return a vector of traversals for a given entry block
std::vector<Traversal> getTraversals(BinaryBasicBlock *EntryBB) {
  std::vector<Traversal> AllTraversals;
  std::stack<std::pair<BinaryBasicBlock *, Traversal>> Stack;
  Stack.push(std::make_pair(EntryBB, Traversal()));
  std::unordered_set<BinaryBasicBlock *> BBSet;

  while (!Stack.empty()) {
    BinaryBasicBlock *CurrentBB = Stack.top().first;
    Traversal PrevTraversal(Stack.top().second);
    Stack.pop();

    // Add the current basic block into consideration
    BBSet.insert(CurrentBB);
    PrevTraversal.push_back(CurrentBB);

    if (CurrentBB->succ_empty()) {
      AllTraversals.push_back(PrevTraversal);
      continue;
    }

    bool HaveSuccCount = false;
    // Calculate the total edge count of successors
    for (auto BI = CurrentBB->branch_info_begin();
         BI != CurrentBB->branch_info_end(); ++BI) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
        HaveSuccCount = true;
        break;
      }
    }
    if (!HaveSuccCount) {
      AllTraversals.push_back(PrevTraversal);
      continue;
    }

    auto BI = CurrentBB->branch_info_begin();
    for (auto *SuccBB : CurrentBB->successors()) {
      // If we have never seen SuccBB, or SuccBB indicates the
      // end of traversal, SuccBB will be added into the stack for
      // further exploration.
      if ((BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 &&
           BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) ||
          SuccBB->succ_empty()) {
        Stack.push(std::make_pair(SuccBB, PrevTraversal));
      }
      ++BI;
    }
  }

  return AllTraversals;
}

/// Given a traversal, return the sum of block distances along this traversal.
double getTraversalLength(
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const Traversal &Path) {
  double Length = 0;
  for (size_t I = 0; I + 1 < Path.size(); I++) {
    // Ignore calls between hot and cold parts
    if (Path[I]->isCold() != Path[I + 1]->isCold())
      continue;
    double SrcAddr = BBAddr.at(Path[I]);
    double DstAddr = BBAddr.at(Path[I + 1]);
    Length += std::abs(SrcAddr - DstAddr);
  }
  return Length;
}

/// Calculate the average traversal length over all graph traversals
double calcGraphDistance(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double TotalTraversalLength = 0;
  double NumTraversals = 0;
  for (auto BF : BinaryFunctions) {
    // Only consider functions which are known to be executed
    if (BF->getKnownExecutionCount() == 0)
      continue;

    for (auto BB : BF->layout()) {
      if (BB->isEntryPoint()) {
        auto AllTraversals = getTraversals(BB);
        for (auto const &Path : AllTraversals) {
          // Ignore short traversals
          if (Path.size() <= 1)
            continue;
          TotalTraversalLength += getTraversalLength(BBAddr, Path);
          NumTraversals++;
        }
      }
    }
  }

  return TotalTraversalLength / NumTraversals;
}

/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
/// the ordering of basic blocks
double calcTSPScore(
@@ -166,22 +107,12 @@ double calcTSPScore(
  return Score;
}

/// Calculate Extended-TSP metric, which quantifies the expected number of
/// i-cache misses for a given ordering of basic blocks. The parameters are:
/// - FallthroughWeight is the impact of fallthrough jumps on the score
/// - ForwardWeight is the impact of forward (but not fallthrough) jumps
/// - BackwardWeight is the impact of backward jumps
/// - ForwardDistance is the max distance of a forward jump affecting the score
/// - BackwardDistance is the max distance of a backward jump affecting the score
/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
/// misses for a given ordering of basic blocks
double calcExtTSPScore(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
    double FallthroughWeight,
    double ForwardWeight,
    double BackwardWeight,
    uint64_t ForwardDistance,
    uint64_t BackwardDistance) {
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  double Score = 0.0;
  for (auto BF : BinaryFunctions) {
@@ -189,33 +120,10 @@ double calcExtTSPScore(
      auto BI = SrcBB->branch_info_begin();
      for (auto DstBB : SrcBB->successors()) {
        if (DstBB != SrcBB) {
          double Count = BI->Count == BinaryBasicBlock::COUNT_NO_PROFILE
                           ? 0.0
                           : double(BI->Count);
          uint64_t SrcAddr = BBAddr.at(SrcBB);
          uint64_t SrcSize = BBSize.at(SrcBB);
          uint64_t DstAddr = BBAddr.at(DstBB);

          if (SrcAddr <= DstAddr) {
            if (SrcAddr + SrcSize == DstAddr) {
              // fallthrough jump
              Score += FallthroughWeight * Count;
            } else {
              // the distance of the forward jump
              size_t Dist = DstAddr - (SrcAddr + SrcSize);
              if (Dist <= ForwardDistance) {
                double Prob = double(ForwardDistance - Dist) / ForwardDistance;
                Score += ForwardWeight * Prob * Count;
              }
            }
          } else {
            // the distance of the backward jump
            size_t Dist = SrcAddr + SrcSize - DstAddr;
            if (Dist <= BackwardDistance) {
              double Prob = double(BackwardDistance - Dist) / BackwardDistance;
              Score += BackwardWeight * Prob * Count;
            }
          }
          Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB),
                                             BBSize.at(SrcBB),
                                             BBAddr.at(DstBB),
                                             BI->Count);
        }
        ++BI;
      }
@@ -277,10 +185,10 @@ extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
    double PageSize,
    uint64_t CacheEntries) {
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  const double PageSize = opts::ITLBPageSize;
  const uint64_t CacheEntries = opts::ITLBEntries;
  auto Calls = extractFunctionCalls(BinaryFunctions);
  // Compute 'hotness' of the functions
  double TotalSamples = 0;
@@ -334,6 +242,34 @@ double expectedCacheHitRatio(
  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // end namespace anonymous

double CacheMetrics::extTSPScore(uint64_t SrcAddr,
                                 uint64_t SrcSize,
                                 uint64_t DstAddr,
                                 uint64_t Count) {
  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);

  // Fallthrough
  if (SrcAddr + SrcSize == DstAddr) {
    return opts::FallthroughWeight * Count;
  }
  // Forward
  if (SrcAddr + SrcSize < DstAddr) {
    const auto Dist = DstAddr - (SrcAddr + SrcSize);
    if (Dist <= opts::ForwardDistance) {
      double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
      return opts::ForwardWeight * Prob * Count;
    }
    return 0;
  }
  // Backward
  const auto Dist = SrcAddr + SrcSize - DstAddr;
  if (Dist <= opts::BackwardDistance) {
    double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
    return opts::BackwardWeight * Prob * Count;
  }
  return 0;
}
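
// Editor's note (not part of the patch): a quick sanity check of the formula
// above, using the option defaults introduced in this commit
// (forward-weight = 0.4, forward-distance = 768):
//   SrcAddr = 0, SrcSize = 64, DstAddr = 448, Count = 100
//   => a forward jump with Dist = 448 - (0 + 64) = 384
//   => Prob  = 1.0 - 384.0 / 768 = 0.5
//   => score = 0.4 * 0.5 * 100 = 20
// A fallthrough jump (DstAddr = 64) would instead contribute
// fallthrough-weight * Count = 1.0 * 100 = 100.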

void CacheMetrics::printAll(
@@ -356,10 +292,10 @@ void CacheMetrics::printAll(
  }

  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) have non-empty execution count\n",
         << format(" %zu (%.2lf%%) have positive execution count\n",
                   NumHotFunctions, 100.0 * NumHotFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) have non-empty execution count\n",
         << format(" %zu (%.2lf%%) have positive execution count\n",
                   NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks);

  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
@@ -377,35 +313,14 @@ void CacheMetrics::printAll(
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize);

  outs() << " An average length of graph traversal: "
         << format("%.0lf\n", calcGraphDistance(BinaryFunctions,
                                                BBAddr,
                                                BBSize));

  outs() << " Expected i-TLB cache hit ratio "
         << format("(%zu, %zu): ", ITLBPageSize, ITLBEntries)
  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BinaryFunctions,
                                                      BBAddr,
                                                      BBSize,
                                                      ITLBPageSize,
                                                      ITLBEntries));
                                                      BBSize));

  outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BinaryFunctions, BBAddr, BBSize));

  outs() << " ExtTSP score "
         << format("(%.2lf, %.2lf, %.2lf, %zu, %zu): ", FallthroughWeight,
                   ForwardWeight,
                   BackwardWeight,
                   ForwardDistance,
                   BackwardDistance)
         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions,
                                              BBAddr,
                                              BBSize,
                                              FallthroughWeight,
                                              ForwardWeight,
                                              BackwardWeight,
                                              ForwardDistance,
                                              BackwardDistance));

  outs() << " ExtTSP score: "
         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions, BBAddr, BBSize));
}

@@ -20,6 +20,17 @@ namespace CacheMetrics {
/// Calculate various metrics related to instruction cache performance.
void printAll(const std::vector<BinaryFunction *> &BinaryFunctions);

/// Calculate Extended-TSP metric, which quantifies the expected number of
/// i-cache misses for a given pair of basic blocks. The parameters are:
/// - SrcAddr is the address of the source block;
/// - SrcSize is the size of the source block;
/// - DstAddr is the address of the destination block;
/// - Count is the number of jumps between the pair of blocks.
double extTSPScore(uint64_t SrcAddr,
                   uint64_t SrcSize,
                   uint64_t DstAddr,
                   uint64_t Count);

} // namespace CacheMetrics
} // namespace bolt
} // namespace llvm

@@ -161,6 +161,9 @@ ReorderBlocks("reorder-blocks",
    "cache",
    "perform optimal layout prioritizing I-cache "
    "behavior"),
  clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS,
    "cache+",
    "perform layout optimizing I-cache behavior"),
  clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
    "cluster-shuffle",
    "perform random layout of clusters"),
@@ -469,6 +472,10 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
    Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
    break;

  case LT_OPTIMIZE_CACHE_PLUS:
    Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
    break;

  case LT_OPTIMIZE_SHUFFLE:
    Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
    break;

@@ -169,6 +169,8 @@ public:
  /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04)
  /// that suggests putting frequently executed chains first in the layout.
  LT_OPTIMIZE_CACHE,
  /// Block reordering guided by the extended TSP metric.
  LT_OPTIMIZE_CACHE_PLUS,
  /// Create clusters and use random order for them.
  LT_OPTIMIZE_SHUFFLE,
};

@@ -5,6 +5,7 @@ add_llvm_library(LLVMBOLTPasses
  BinaryFunctionCallGraph.cpp
  CallGraph.cpp
  CallGraphWalker.cpp
  CachePlusReorderAlgorithm.cpp
  DataflowAnalysis.cpp
  DataflowInfoManager.cpp
  FrameAnalysis.cpp

@@ -0,0 +1,476 @@
//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "CacheMetrics.h"
#include "ReorderAlgorithm.h"
#include "ReorderUtils.h"

using namespace llvm;
using namespace bolt;
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;

namespace llvm {
namespace bolt {

namespace {

// A cluster (ordered sequence) of basic blocks
class Cluster {
public:
  Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_)
      : Id(BB->getLayoutIndex()),
        IsEntry(BB->getLayoutIndex() == 0),
        ExecutionCount(ExecutionCount_),
        Size(Size_),
        Score(0) {
    Blocks.push_back(BB);
  }

  size_t id() const {
    return Id;
  }

  uint64_t size() const {
    return Size;
  }

  double density() const {
    return static_cast<double>(ExecutionCount) / Size;
  }

  bool isCold() const {
    return ExecutionCount == 0;
  }

  uint64_t executionCount() const {
    return ExecutionCount;
  }

  bool isEntryPoint() const {
    return IsEntry;
  }

  double score() const {
    return Score;
  }

  const std::vector<BinaryBasicBlock *> &blocks() const {
    return Blocks;
  }

  /// Update the list of basic blocks and meta-info
  void merge(const Cluster *Other,
             const std::vector<BinaryBasicBlock *> &MergedBlocks,
             double MergedScore) {
    Blocks = MergedBlocks;
    IsEntry |= Other->IsEntry;
    ExecutionCount += Other->ExecutionCount;
    Size += Other->Size;
    Score = MergedScore;
  }

private:
  std::vector<BinaryBasicBlock *> Blocks;
  size_t Id;
  bool IsEntry;
  uint64_t ExecutionCount;
  uint64_t Size;
  double Score;
};

/// Deterministically compare clusters by their density in decreasing order
bool compareClusters(const Cluster *C1, const Cluster *C2) {
  // original entry point to the front
  if (C1->isEntryPoint())
    return true;
  if (C2->isEntryPoint())
    return false;

  const double D1 = C1->density();
  const double D2 = C2->density();
  if (D1 != D2)
    return D1 > D2;
  // Making the order deterministic
  return C1->id() < C2->id();
}

/// Deterministically compare pairs of clusters
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                         const Cluster *A2, const Cluster *B2) {
  const auto Samples1 = A1->executionCount() + B1->executionCount();
  const auto Samples2 = A2->executionCount() + B2->executionCount();
  if (Samples1 != Samples2)
    return Samples1 < Samples2;

  if (A1 != A2)
    return A1->id() < A2->id();
  return B1->id() < B2->id();
}

} // end namespace anonymous

/// CachePlus - layout of basic blocks with i-cache optimization.
///
/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy
/// heuristic that works with clusters (ordered sequences) of basic blocks.
/// Initially all clusters are isolated basic blocks. On every iteration,
/// we pick a pair of clusters whose merging yields the biggest increase in
/// the ExtTSP metric (see CacheMetrics.cpp for exact implementation), which
/// models how i-cache "friendly" a specific cluster is. A pair of clusters
/// giving the maximum gain is merged into a new cluster. The procedure stops
/// when there is only one cluster left, or when merging does not increase
/// ExtTSP. In the latter case, the remaining clusters are sorted by density.
///
/// An important aspect is the way two clusters are merged. Unlike earlier
/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
/// clusters, X and Y, are first split into three, X1, X2, and Y. Then we
/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y,
/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
/// This improves the quality of the final result (the search space is larger)
/// while keeping the implementation sufficiently fast.
class CachePlus {
public:
  CachePlus(const BinaryFunction &BF)
      : BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) {
    initialize();
  }

  /// Run the cache+ algorithm and return a basic block ordering
  std::vector<BinaryBasicBlock *> run() {
    // Merge pairs of clusters while there is an improvement in the ExtTSP metric
    while (Clusters.size() > 1) {
      Cluster *BestClusterPred = nullptr;
      Cluster *BestClusterSucc = nullptr;
      std::pair<double, size_t> BestGain(-1, 0);
      for (auto ClusterPred : Clusters) {
        // Get candidates for merging with the current cluster
        Adjacent.forAllAdjacent(
          ClusterPred,
          // Find the best candidate
          [&](Cluster *ClusterSucc) {
            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
            // Do not merge cold blocks
            if (ClusterPred->isCold() || ClusterSucc->isCold())
              return;

            // Compute the gain of merging two clusters
            auto Gain = mergeGain(ClusterPred, ClusterSucc);
            if (Gain.first <= 0.0)
              return;

            // Break ties by density so that the hottest clusters are merged first
            if (Gain.first > BestGain.first ||
                (std::abs(Gain.first - BestGain.first) < 1e-8 &&
                 compareClusterPairs(ClusterPred,
                                     ClusterSucc,
                                     BestClusterPred,
                                     BestClusterSucc))) {
              BestGain = Gain;
              BestClusterPred = ClusterPred;
              BestClusterSucc = ClusterSucc;
            }
          });
      }

      // Stop merging when there is no improvement
      if (BestGain.first <= 0.0)
        break;

      // Merge the best pair of clusters
      mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
    }

    // Sort the clusters by density
    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);

    // Collect the basic blocks in the order specified by their clusters
    std::vector<BinaryBasicBlock *> Result;
    Result.reserve(BF.layout_size());
    for (auto Cluster : Clusters) {
      Result.insert(Result.end(),
                    Cluster->blocks().begin(),
                    Cluster->blocks().end());
    }

    return Result;
  }

private:
  /// Initialize the set of active clusters, edges between blocks, and
  /// adjacency matrix.
  void initialize() {
    // Initialize indices of basic blocks
    size_t LayoutIndex = 0;
    for (auto BB : BF.layout()) {
      BB->setLayoutIndex(LayoutIndex);
      LayoutIndex++;
    }

    // Initialize edges for the blocks and compute their total in/out weights
    OutEdges = std::vector<EdgeList>(BF.layout_size());
    auto InWeight = std::vector<uint64_t>(BF.layout_size(), 0);
    auto OutWeight = std::vector<uint64_t>(BF.layout_size(), 0);
    for (auto BB : BF.layout()) {
      auto BI = BB->branch_info_begin();
      for (auto I : BB->successors()) {
        assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
               "missing profile for a jump");
        if (I != BB && BI->Count > 0) {
          InWeight[I->getLayoutIndex()] += BI->Count;
          OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count));
          OutWeight[BB->getLayoutIndex()] += BI->Count;
        }
        ++BI;
      }
    }

    // Initialize the execution count for every basic block, which is the
    // maximum over the sums of all in and out edge weights.
    // The execution count of the entry point is also set to at least 1.
    auto ExecutionCounts = std::vector<uint64_t>(BF.layout_size(), 0);
    for (auto BB : BF.layout()) {
      uint64_t EC = BB->getKnownExecutionCount();
      EC = std::max(EC, InWeight[BB->getLayoutIndex()]);
      EC = std::max(EC, OutWeight[BB->getLayoutIndex()]);
      if (BB->getLayoutIndex() == 0)
        EC = std::max(EC, uint64_t(1));
      ExecutionCounts[BB->getLayoutIndex()] = EC;
    }

    // Initialize clusters
    Clusters.reserve(BF.layout_size());
    AllClusters.reserve(BF.layout_size());
    Size.reserve(BF.layout_size());
    for (auto BB : BF.layout()) {
      size_t Index = BB->getLayoutIndex();
      Size.push_back(std::max(BB->estimateSize(), size_t(1)));
      AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
      Clusters.push_back(&AllClusters[Index]);
    }

    // Initialize the adjacency matrix
    Adjacent.initialize(Clusters);
    for (auto BB : BF.layout()) {
      for (auto I : BB->successors()) {
        if (BB != I)
          Adjacent.set(Clusters[BB->getLayoutIndex()],
                       Clusters[I->getLayoutIndex()]);
      }
    }
  }

  /// Compute the ExtTSP score for a given order of basic blocks
  double score(const std::vector<BinaryBasicBlock *> &Blocks) const {
    uint64_t NotSet = static_cast<uint64_t>(-1);
    auto Addr = std::vector<uint64_t>(BF.layout_size(), NotSet);
    uint64_t CurAddr = 0;
    for (auto BB : Blocks) {
      size_t Index = BB->getLayoutIndex();
      Addr[Index] = CurAddr;
      CurAddr += Size[Index];
    }

    double Score = 0;
    for (auto BB : Blocks) {
      size_t Index = BB->getLayoutIndex();
      for (auto Edge : OutEdges[Index]) {
        auto SuccBB = Edge.first;
        size_t SuccIndex = SuccBB->getLayoutIndex();

        if (Addr[SuccBB->getLayoutIndex()] != NotSet) {
          Score += CacheMetrics::extTSPScore(Addr[Index],
                                             Size[Index],
                                             Addr[SuccIndex],
                                             Edge.second);
        }
      }
    }
    return Score;
  }

  /// The gain of merging two clusters.
  ///
  /// The function considers all possible ways of merging two clusters and
  /// computes the one having the largest increase in the ExtTSP metric. The
  /// result is a pair with the first element being the gain and the second
  /// element being the corresponding merging type (encoded as an integer).
  std::pair<double, size_t> mergeGain(const Cluster *ClusterPred,
                                      const Cluster *ClusterSucc) const {
    if (Cache.contains(ClusterPred, ClusterSucc)) {
      return Cache.get(ClusterPred, ClusterSucc);
    }

    // The current score of two separate clusters
    const auto CurScore = ClusterPred->score() + ClusterSucc->score();

    // Merge two clusters and update the best Gain
    auto computeMergeGain = [&](const std::pair<double, size_t> &CurGain,
                                const Cluster *ClusterPred,
                                const Cluster *ClusterSucc,
                                size_t MergeType) {
      auto MergedBlocks = mergeBlocks(ClusterPred->blocks(),
                                      ClusterSucc->blocks(),
                                      MergeType);
      // Does the new cluster preserve the original entry point?
      if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
          MergedBlocks[0]->getLayoutIndex() != 0)
        return CurGain;

      // The score of the new cluster
      const auto NewScore = score(MergedBlocks);
      if (NewScore > CurScore && NewScore - CurScore > CurGain.first)
        return std::make_pair(NewScore - CurScore, MergeType);
      else
        return CurGain;
    };

    std::pair<double, size_t> Gain = std::make_pair(-1, 0);
    // Try to simply concatenate two clusters
    Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
    // Try to split ClusterPred into two and merge with ClusterSucc
    for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
      for (size_t Type = 0; Type < 4; Type++) {
        size_t MergeType = 1 + Type + Offset * 4;
        Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
      }
    }

    Cache.set(ClusterPred, ClusterSucc, Gain);
    return Gain;
  }

  /// Merge two clusters (orders) of blocks according to a given 'merge type'.
  ///
  /// If MergeType == 0, then the result is a concatenation of two clusters.
  /// Otherwise, the first cluster is cut into two and we consider all possible
  /// ways of concatenating three clusters.
  std::vector<BinaryBasicBlock *> mergeBlocks(
      const std::vector<BinaryBasicBlock *> &X,
      const std::vector<BinaryBasicBlock *> &Y,
      size_t MergeType) const {
    // Concatenate three clusters of blocks in the given order
    auto concat = [&](const std::vector<BinaryBasicBlock *> &A,
                      const std::vector<BinaryBasicBlock *> &B,
                      const std::vector<BinaryBasicBlock *> &C) {
      std::vector<BinaryBasicBlock *> Result;
      Result.reserve(A.size() + B.size() + C.size());
      Result.insert(Result.end(), A.begin(), A.end());
      Result.insert(Result.end(), B.begin(), B.end());
      Result.insert(Result.end(), C.begin(), C.end());
      return Result;
    };

    // Merging w/o splitting existing clusters
    if (MergeType == 0) {
      return concat(X, Y, std::vector<BinaryBasicBlock *>());
    }

    MergeType--;
    size_t Type = MergeType % 4;
    size_t Offset = MergeType / 4;
    assert(0 < Offset && Offset < X.size() &&
           "Invalid offset while merging clusters");
    // Split the first cluster, X, into X1 and X2
    std::vector<BinaryBasicBlock *> X1(X.begin(), X.begin() + Offset);
    std::vector<BinaryBasicBlock *> X2(X.begin() + Offset, X.end());

    // Construct a new cluster from three existing ones
    switch (Type) {
    case 0: return concat(X1, Y, X2);
    case 1: return concat(Y, X2, X1);
    case 2: return concat(X2, Y, X1);
    case 3: return concat(X2, X1, Y);
    default:
      llvm_unreachable("unexpected merge type");
    }
  }
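
  // Editor's note (not part of the patch): MergeType packs the split offset
  // and the gluing order as 1 + Type + Offset * 4, mirroring the loop in
  // mergeGain above. For example, MergeType = 6 decodes (after MergeType--)
  // to Type = 1 and Offset = 1: X is split after its first block and the
  // result is concat(Y, X2, X1). MergeType = 0 is plain concatenation of
  // X and Y.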

  /// Merge cluster From into cluster Into, update the list of active clusters,
  /// adjacency information, and the corresponding cache.
  void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
    assert(Into != From && "Cluster cannot be merged with itself");
    // Merge the clusters
    auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
    Into->merge(From, MergedBlocks, score(MergedBlocks));

    // Remove cluster From from the list of active clusters
    auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
    Clusters.erase(Iter, Clusters.end());

    // Invalidate caches
    Cache.invalidate(Into);

    // Update the adjacency matrix
    Adjacent.merge(Into, From);
  }

  // The binary function
  const BinaryFunction &BF;

  // All clusters
  std::vector<Cluster> AllClusters;

  // Active clusters. The vector gets updated at runtime when clusters are merged
  std::vector<Cluster *> Clusters;

  // Size of the block
  std::vector<uint64_t> Size;

  // Outgoing edges of the block
  std::vector<EdgeList> OutEdges;

  // Cluster adjacency matrix
  AdjacencyMatrix<Cluster> Adjacent;

  // A cache that keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we invalidate the pairs
  // containing both x and y and all clusters adjacent to x and y (and recompute
  // them on the next iteration).
  mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
};

void CachePlusReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Are there jumps with positive execution count?
  uint64_t SumCount = 0;
  for (auto BB : BF.layout()) {
    auto BI = BB->branch_info_begin();
    for (auto I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr);
      SumCount += BI->Count;
      ++BI;
    }
  }

  // Do not change the layout of functions w/o profile information
  if (SumCount == 0) {
    for (auto BB : BF.layout()) {
      Order.push_back(BB);
    }
    return;
  }

  // Apply the algorithm
  Order = CachePlus(BF).run();

  // Verify correctness
  assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
  assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout");
}

} // namespace bolt
} // namespace llvm

@@ -29,11 +29,8 @@

#include "BinaryFunction.h"
#include "HFSort.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/raw_ostream.h"

#include <vector>
#include <unordered_map>

@@ -48,21 +45,9 @@ using namespace bolt;
namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> Verbosity;

cl::opt<unsigned>
ITLBPageSizeParam("itlb-page-size",
  cl::desc("The size of i-tlb cache page"),
  cl::init(4096),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<unsigned>
ITLBEntriesParam("itlb-entries",
  cl::desc("The number of entries in i-tlb cache"),
  cl::init(16),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));
extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

cl::opt<double>
MergeProbability("merge-probability",

@@ -92,189 +77,46 @@ int32_t ITLBPageSize;
// while smaller values result in better i-cache performance
int32_t ITLBEntries;

// This class maintains adjacency information for all Clusters being
// processed. It is used to invalidate cache entries when merging
// Clusters and for visiting all neighbors of any given Cluster.
class AdjacencyMatrix {
public:
  AdjacencyMatrix(const CallGraph &Cg,
                  std::vector<Cluster *> &Clusters,
                  const std::vector<Cluster *> &FuncCluster)
      : Clusters(Clusters),
        Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) {
    initialize(Cg, FuncCluster);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) const {
    const_cast<AdjacencyMatrix *>(this)->forallAdjacent(C, Func);
  }

  template <typename F>
  void forallAdjacent(const Cluster *C, F Func) {
    for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) {
      Func(Clusters[I]);
    }
  }

  // Merge adjacency info from cluster B into cluster A. Info for cluster B is left
  // in an undefined state.
  void merge(const Cluster *A, const Cluster *B) {
    Bits[A->id()] |= Bits[B->id()];
    Bits[A->id()][A->id()] = false;
    Bits[A->id()][B->id()] = false;
    Bits[B->id()][A->id()] = false;
    for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) {
      Bits[I][A->id()] = true;
      Bits[I][B->id()] = false;
    }
  }

  void dump(const Cluster *A) const {
    outs() << "Cluster " << A->id() << ":";
    forallAdjacent(A, [](const Cluster *B) { outs() << " " << B->id(); });
  }

  void dump() const {
    for (auto *A : Clusters) {
      if (!A) continue;
      dump(A);
      outs() << "\n";
    }
  }
private:
  void set(const Cluster *A, const Cluster *B, bool Value) {
    assert(A != B);
    Bits[A->id()][B->id()] = Value;
    Bits[B->id()][A->id()] = Value;
  }

  void initialize(const CallGraph &Cg, const std::vector<Cluster *> &FuncCluster) {
    for (auto *A : Clusters) {
      for (auto TargetId : A->targets()) {
        for (auto Succ : Cg.successors(TargetId)) {
          auto *B = FuncCluster[Succ];
          if (!B || B == A) continue;
          const auto &Arc = *Cg.findArc(TargetId, Succ);
          if (Arc.weight() <= 0.0) continue;

          set(A, B, true);
        }
        for (auto Pred : Cg.predecessors(TargetId)) {
          auto *B = FuncCluster[Pred];
          if (!B || B == A) continue;
          const auto &Arc = *Cg.findArc(Pred, TargetId);
          if (Arc.weight() <= 0.0) continue;

          set(A, B, true);
        }
      }
    }
  }

  std::vector<Cluster *> Clusters;
  std::vector<BitVector> Bits;
};

// A cache of precomputed results for a pair of clusters
class PrecomputedResults {
public:
  explicit PrecomputedResults(size_t Size)
      : Size(Size),
        Cache(new double[Size*Size]),
        Valid(Size * Size, false) {
    memset(Cache, 0, sizeof(double)*Size*Size);
  }
  ~PrecomputedResults() {
    delete[] Cache;
  }

  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)];
  }

  double get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  void set(const Cluster *First, const Cluster *Second, double Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = true;
  }

  void invalidate(const Cluster *C) {
    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
    for (size_t Id = 0; Id < Size; Id++) {
      Valid.reset(Id * Size + C->id());
    }
  }

private:
  size_t index(const Cluster *First, const Cluster *Second) const {
    return First->id() * Size + Second->id();
  }

  size_t Size;
  double *Cache;
  BitVector Valid;
};

/*
 * Erase an element from a container if it is present. Otherwise, do nothing.
 */
template <typename C, typename V>
void maybeErase(C &Container, const V& Value) {
  auto Itr = Container.find(Value);
  if (Itr != Container.end())
    Container.erase(Itr);
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
/// Density of a cluster formed by merging a given pair of clusters.
double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) {
  const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
  const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
  return CombinedSamples / CombinedSize;
}

/*
 * Deterministically compare clusters by their density in decreasing order.
 */
/// Deterministically compare clusters by density in decreasing order.
bool compareClusters(const Cluster *C1, const Cluster *C2) {
  const double D1 = C1->density();
  const double D2 = C2->density();
  if (D1 != D2) return D1 > D2;
  if (D1 != D2)
    return D1 > D2;
  // making sure the sorting is deterministic
  if (C1->size() != C2->size()) return C1->size() < C2->size();
  if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
  if (C1->size() != C2->size())
    return C1->size() < C2->size();
  if (C1->samples() != C2->samples())
    return C1->samples() > C2->samples();
  return C1->target(0) < C2->target(0);
}

/*
 * Deterministically compare pairs of clusters by their density
 * in decreasing order.
 */
/// Deterministically compare pairs of clusters by density in decreasing order.
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                         const Cluster *A2, const Cluster *B2) {
  const auto D1 = density(A1, B1);
  const auto D2 = density(A2, B2);
  if (D1 != D2) return D1 > D2;
  // making sure the sorting is deterministic
  if (D1 != D2)
    return D1 > D2;
  const auto Size1 = A1->size() + B1->size();
  const auto Size2 = A2->size() + B2->size();
  if (Size1 != Size2) return Size1 < Size2;
  if (Size1 != Size2)
    return Size1 < Size2;
  const auto Samples1 = A1->samples() + B1->samples();
  const auto Samples2 = A2->samples() + B2->samples();
  if (Samples1 != Samples2) return Samples1 > Samples2;
  if (Samples1 != Samples2)
    return Samples1 > Samples2;
  return A1->target(0) < A2->target(0);
}

/*
 * Sorting clusters by their density in decreasing order
 */
/// Sorting clusters by their density in decreasing order.
template <typename C>
std::vector<Cluster *> sortByDensity(const C &Clusters_) {
  std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());

@@ -282,27 +124,23 @@ std::vector<Cluster *> sortByDensity(const C &Clusters_) {
  return Clusters;
}

/*
 * HFSortPlus - layout of hot functions with iTLB cache optimization
 *
 * Given an ordering of hot functions (and hence, their assignment to the
 * iTLB pages), we can divide all function calls into two categories:
 * - 'short' ones that have a caller-callee distance less than a page;
 * - 'long' ones where the distance exceeds a page.
 * The short calls are likely to result in an iTLB cache hit. For the long ones,
 * the hit/miss result depends on the 'hotness' of the page (i.e., how often
 * the page is accessed). Assuming that functions are sent to the iTLB cache
 * in a random order, the probability that a page is present in the cache is
 * proportional to the number of samples corresponding to the functions on the
 * page. The following algorithm detects short and long calls, and optimizes
 * the expected number of cache misses for the long ones.
 */
/// HFSortPlus - layout of hot functions with iTLB cache optimization
///
/// Given an ordering of hot functions (and hence, their assignment to the
/// iTLB pages), we can divide all function calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an iTLB cache hit. For the long ones,
/// the hit/miss result depends on the 'hotness' of the page (i.e., how often
/// the page is accessed). Assuming that functions are sent to the iTLB cache
/// in a random order, the probability that a page is present in the cache is
/// proportional to the number of samples corresponding to the functions on the
/// page. The following algorithm detects short and long calls, and optimizes
/// the expected number of cache misses for the long ones.
class HFSortPlus {
public:
  /*
   * The expected number of calls on different i-TLB pages for an arc of the
   * call graph with a specified weight
   */
  /// The expected number of calls on different i-TLB pages for an arc of the
  /// call graph with a specified weight
  double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double Weight) const {
    const auto Dist = std::abs(SrcAddr - DstAddr);
    if (Dist > ITLBPageSize)

@@ -313,15 +151,13 @@ public:
    return (1.0 - X * X) * Weight;
  }

  /*
   * The probability that a page with a given weight is not present in the cache
   *
   * Assume that the hot functions are called in a random order; then the
   * probability of an i-TLB page being accessed after a function call is
   * p=pageSamples/totalSamples. The probability that the page is not accessed
   * is (1-p), and the probability that it is not in the cache (i.e. not accessed
   * during the last ITLBEntries function calls) is (1-p)^ITLBEntries
   */
  /// The probability that a page with a given weight is not present in the cache
  ///
  /// Assume that the hot functions are called in a random order; then the
  /// probability of an i-TLB page being accessed after a function call is
  /// p=pageSamples/totalSamples. The probability that the page is not accessed
  /// is (1-p), and the probability that it is not in the cache (i.e. not accessed
  /// during the last ITLBEntries function calls) is (1-p)^ITLBEntries
  double missProbability(double PageSamples) const {
    double P = PageSamples / TotalSamples;
    double X = ITLBEntries;

@@ -330,10 +166,8 @@ public:
    return pow(1.0 - P, X);
  }
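
  // Editor's note (not part of the patch): for a page holding 10% of all
  // samples (P = 0.1) and the default of 16 i-TLB entries, the miss
  // probability is (1 - 0.1)^16 ~= 0.185, i.e. the page is found in the
  // cache roughly 81.5% of the time.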

  /*
   * The expected number of calls within a given cluster with both endpoints on
   * the same cache page
   */
  /// The expected number of calls within a given cluster with both endpoints on
  /// the same cache page
  double shortCalls(const Cluster *Cluster) const {
    double Calls = 0;
    for (auto TargetId : Cluster->targets()) {

@@ -352,10 +186,8 @@ public:
    return Calls;
  }

  /*
   * The number of calls between the two clusters with both endpoints on
   * the same i-TLB page, assuming that a given pair of clusters gets merged
   */
  /// The number of calls between the two clusters with both endpoints on
  /// the same i-TLB page, assuming that a given pair of clusters gets merged
  double shortCalls(const Cluster *ClusterPred,
                    const Cluster *ClusterSucc) const {
    double Calls = 0;

@@ -389,18 +221,16 @@ public:
    return Calls;
  }

  /*
   * The gain of merging two clusters.
   *
   * We assume that the final clusters are sorted by their density, and hence
   * every cluster is likely to be adjacent to clusters of the same density.
   * Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
   * which is used to compute the probability of cache misses for long calls
   * of a given cluster.
   * The result is also scaled by the size of the resulting cluster in order to
   * increase the chance of merging short clusters, which is helpful for
   * the i-cache performance.
   */
  /// The gain of merging two clusters.
  ///
  /// We assume that the final clusters are sorted by their density, and hence
  /// every cluster is likely to be adjacent to clusters of the same density.
  /// Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
  /// which is used to compute the probability of cache misses for long calls
  /// of a given cluster.
  /// The result is also scaled by the size of the resulting cluster in order to
  /// increase the chance of merging short clusters, which is helpful for
  /// the i-cache performance.
  double mergeGain(const Cluster *ClusterPred,
                   const Cluster *ClusterSucc) const {
    if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {

@@ -435,9 +265,7 @@ public:
    return Gain;
  }

  /*
   * For every active cluster, compute its total weight of outgoing edges
   */
  /// For every active cluster, compute its total weight of outgoing edges
  std::unordered_map<Cluster *, double> computeOutgoingWeight() {
    std::unordered_map<Cluster *, double> OutWeight;
    for (auto ClusterPred : Clusters) {

@@ -456,9 +284,7 @@ public:
    return OutWeight;
  }

  /*
   * Find pairs of clusters that call each other with high probability
   */
  /// Find pairs of clusters that call each other with high probability
  std::vector<std::pair<Cluster *, Cluster *>> findClustersToMerge() {
    // compute the total weight of outgoing edges for every cluster
    auto OutWeight = computeOutgoingWeight();

@@ -503,10 +329,8 @@ public:
    return PairsToMerge;
  }

  /*
   * Run the first optimization pass of the hfsort+ algorithm:
   * Merge clusters that call each other with high probability
   */
  /// Run the first optimization pass of the hfsort+ algorithm:
  /// Merge clusters that call each other with high probability
  void runPassOne() {
    while (Clusters.size() > 1) {
      // pairs of clusters that will be merged on this iteration

@@ -523,11 +347,9 @@ public:
    }
  }

  /*
   * Run the second optimization pass of the hfsort+ algorithm:
   * Merge pairs of clusters while there is an improvement in the
   * expected cache miss ratio
   */
  /// Run the second optimization pass of the hfsort+ algorithm:
  /// Merge pairs of clusters while there is an improvement in the
  /// expected cache miss ratio
  void runPassTwo() {
    while (Clusters.size() > 1) {
      Cluster *BestClusterPred = nullptr;

@@ -535,7 +357,7 @@ public:
      double BestGain = -1;
      for (auto ClusterPred : Clusters) {
        // get candidates for merging with the current cluster
        Adjacent.forallAdjacent(
        Adjacent.forAllAdjacent(
          ClusterPred,
          // find the best candidate
          [&](Cluster *ClusterSucc) {

@@ -565,9 +387,7 @@ public:
    }
  }

  /*
   * Run the hfsort+ algorithm and return an ordered set of function clusters.
   */
  /// Run the hfsort+ algorithm and return an ordered set of function clusters.
  std::vector<Cluster> run() {
    DEBUG(dbgs() << "Starting hfsort+ w/"
                 << (UseGainCache ? "gain cache" : "no cache")

@@ -602,19 +422,37 @@ public:
      Addr(Cg.numNodes(), InvalidAddr),
      TotalSamples(0.0),
      Clusters(initializeClusters()),
      Adjacent(Cg, Clusters, FuncCluster),
      Adjacent(Cg.numNodes()),
      UseGainCache(UseGainCache),
      GainCache(Clusters.size()) {
    // Initialize the adjacency matrix
    Adjacent.initialize(Clusters);
    for (auto *A : Clusters) {
      for (auto TargetId : A->targets()) {
        for (auto Succ : Cg.successors(TargetId)) {
          auto *B = FuncCluster[Succ];
          if (!B || B == A) continue;
          const auto &Arc = *Cg.findArc(TargetId, Succ);
          if (Arc.weight() > 0.0)
            Adjacent.set(A, B);
        }
        for (auto Pred : Cg.predecessors(TargetId)) {
          auto *B = FuncCluster[Pred];
          if (!B || B == A) continue;
          const auto &Arc = *Cg.findArc(Pred, TargetId);
          if (Arc.weight() > 0.0)
            Adjacent.set(A, B);
        }
      }
    }
  }
private:

  /*
   * Initialize the set of active clusters, function id to cluster mapping,
   * total number of samples and function addresses.
   */
private:
  /// Initialize the set of active clusters, function id to cluster mapping,
  /// total number of samples and function addresses.
  std::vector<Cluster *> initializeClusters() {
    ITLBPageSize = opts::ITLBPageSizeParam;
    ITLBEntries = opts::ITLBEntriesParam;
    ITLBPageSize = opts::ITLBPageSize;
    ITLBEntries = opts::ITLBEntries;

    // Initialize clusters
    std::vector<Cluster *> Clusters;

@@ -632,16 +470,8 @@ private:
    return Clusters;
  }

  /*
   * Merge cluster From into cluster Into and update the list of active clusters
   */
  /// Merge cluster From into cluster Into and update the list of active clusters
  void mergeClusters(Cluster *Into, Cluster *From) {
    DEBUG(
      if (opts::Verbosity > 0) {
        dbgs() << "Merging cluster " << From->id()
               << " into cluster " << Into->id() << "\n";
      });

    // The adjacency merge must happen before the Cluster::merge since that
    // clobbers the contents of From.
    Adjacent.merge(Into, From);

@@ -690,7 +520,7 @@ private:
  std::vector<Cluster *> Clusters;

  // Cluster adjacency matrix
  AdjacencyMatrix Adjacent;
  AdjacencyMatrix<Cluster> Adjacent;

  // Use cache for mergeGain results
  bool UseGainCache;

@@ -699,10 +529,10 @@ private:
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing both x and y and all clusters adjacent to x and y (and recompute
  // them on the next iteration).
  mutable PrecomputedResults GainCache;
  mutable ClusterPairCache<Cluster, double> GainCache;
};

}
} // end namespace anonymous

std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache) {
  // It is required that the sum of incoming arc weights is not greater

@@ -243,6 +243,16 @@ public:
    const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};

/// A new reordering algorithm for basic blocks, cache+
class CachePlusReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit CachePlusReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};

/// Toy example that simply reverses the original basic block order.
class ReverseReorderAlgorithm : public ReorderAlgorithm {

@@ -0,0 +1,112 @@
//===- Passes/ReorderUtils.h - Helper methods for function and block reordering //
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H

#include <memory>
#include <vector>

#include "llvm/ADT/BitVector.h"

namespace llvm {
namespace bolt {

// This class maintains adjacency information for all Clusters being
// processed. It is used for visiting all neighbors of any given Cluster
// while merging pairs of Clusters. Every Cluster must implement the id() method
template <typename Cluster> class AdjacencyMatrix {
public:
  explicit AdjacencyMatrix(size_t Size) : Bits(Size, BitVector(Size, false)) {}

  void initialize(std::vector<Cluster *> &_Clusters) { Clusters = _Clusters; }

  template <typename F> void forAllAdjacent(const Cluster *C, F Func) const {
    const_cast<AdjacencyMatrix *>(this)->forAllAdjacent(C, Func);
  }

  template <typename F> void forAllAdjacent(const Cluster *C, F Func) {
    for (auto I = Bits[C->id()].find_first(); I != -1;
         I = Bits[C->id()].find_next(I)) {
      Func(Clusters[I]);
    }
  }

  /// Merge adjacency info from cluster B into cluster A. Info for cluster B is
  /// left in an undefined state.
  void merge(const Cluster *A, const Cluster *B) {
    Bits[A->id()] |= Bits[B->id()];
    Bits[A->id()][A->id()] = false;
    Bits[A->id()][B->id()] = false;
    Bits[B->id()][A->id()] = false;
    for (auto I = Bits[B->id()].find_first(); I != -1;
         I = Bits[B->id()].find_next(I)) {
      Bits[I][A->id()] = true;
      Bits[I][B->id()] = false;
    }
  }

  void set(const Cluster *A, const Cluster *B) { set(A, B, true); }

private:
  void set(const Cluster *A, const Cluster *B, bool Value) {
    assert(A != B);
    Bits[A->id()][B->id()] = Value;
    Bits[B->id()][A->id()] = Value;
  }

  std::vector<Cluster *> Clusters;
  std::vector<BitVector> Bits;
};

// This class holds cached results of a specified type for a pair of Clusters.
// It can invalidate all cache entries associated with a given Cluster.
template <typename Cluster, typename ValueType> class ClusterPairCache {
public:
  explicit ClusterPairCache(size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size, false) {}

  bool contains(const Cluster *First, const Cluster *Second) const {
    return Valid[index(First, Second)];
  }

  ValueType get(const Cluster *First, const Cluster *Second) const {
    assert(contains(First, Second));
    return Cache[index(First, Second)];
  }

  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
    const auto Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = true;
  }

  void invalidate(const Cluster *C) {
    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
    for (size_t id = 0; id < Size; id++) {
      Valid.reset((id * Size) + C->id());
    }
  }

private:
  size_t index(const Cluster *First, const Cluster *Second) const {
    return (First->id() * Size) + Second->id();
  }

  size_t Size;
  std::vector<ValueType> Cache;
  BitVector Valid;
};

} // namespace bolt
} // namespace llvm

#endif
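
A minimal usage sketch of these two helpers (an editor's illustration, not part of the patch; MyCluster and the include path are hypothetical — the only requirement on the cluster type is an id() method):

#include "ReorderUtils.h" // assumed include path
#include <cstddef>
#include <vector>

using namespace llvm::bolt;

// Hypothetical minimal cluster type; only id() is required by the helpers.
struct MyCluster {
  std::size_t ID;
  std::size_t id() const { return ID; }
};

void example() {
  MyCluster A{0}, B{1}, C{2};
  std::vector<MyCluster *> Clusters = {&A, &B, &C};

  AdjacencyMatrix<MyCluster> Adjacent(Clusters.size());
  Adjacent.initialize(Clusters);
  Adjacent.set(&A, &B);
  Adjacent.set(&B, &C);

  // Visits A and C, the neighbors of B.
  Adjacent.forAllAdjacent(&B, [](MyCluster *N) { (void)N; });

  // Cache a value for the pair (A, B), then drop all entries touching A,
  // as the merging algorithms above do after combining a pair of clusters.
  ClusterPairCache<MyCluster, double> Cache(Clusters.size());
  Cache.set(&A, &B, 3.14);
  if (Cache.contains(&A, &B))
    (void)Cache.get(&A, &B);
  Cache.invalidate(&A);
}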