forked from OSchip/llvm-project
Refactoring of the reordering algorithms
Summary: The various reorder and clustering algorithms have been refactored into separate classes, so that it is easier to add new algorithms and/or change the logic of algorithm selection. (cherry picked from FBD3473656)
This commit is contained in:
parent
f1192a7118
commit
d09b00ebff
|
@ -85,9 +85,6 @@ class BinaryBasicBlock {
|
|||
|
||||
/// Each successor has a corresponding BranchInfo entry in the list.
|
||||
std::vector<BinaryBranchInfo> BranchInfo;
|
||||
typedef std::vector<BinaryBranchInfo>::iterator branch_info_iterator;
|
||||
typedef std::vector<BinaryBranchInfo>::const_iterator
|
||||
const_branch_info_iterator;
|
||||
|
||||
BinaryBasicBlock() {}
|
||||
|
||||
|
@ -252,6 +249,25 @@ public:
|
|||
return iterator_range<const_lp_iterator>(lp_begin(), lp_end());
|
||||
}
|
||||
|
||||
// BranchInfo iterators.
|
||||
typedef std::vector<BinaryBranchInfo>::const_iterator
|
||||
const_branch_info_iterator;
|
||||
|
||||
/// Read-only iterator positioned at the first BranchInfo entry.
const_branch_info_iterator branch_info_begin() const {
  return BranchInfo.cbegin();
}
|
||||
/// Read-only iterator positioned one past the last BranchInfo entry.
const_branch_info_iterator branch_info_end() const {
  return BranchInfo.cend();
}
|
||||
unsigned branch_info_size() const {
|
||||
return (unsigned)BranchInfo.size();
|
||||
}
|
||||
bool branch_info_empty() const
|
||||
{ return BranchInfo.empty(); }
|
||||
|
||||
/// Read-only range over all BranchInfo entries, suitable for range-based for.
inline iterator_range<const_branch_info_iterator> branch_info() const {
  const_branch_info_iterator First = branch_info_begin();
  const_branch_info_iterator Last = branch_info_end();
  return iterator_range<const_branch_info_iterator>(First, Last);
}
|
||||
|
||||
/// Return symbol marking the start of this basic block.
|
||||
MCSymbol *getLabel() const {
|
||||
return Label;
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "ReorderAlgorithm.h"
|
||||
#include "DataReader.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
|
||||
|
@ -41,9 +42,6 @@ AgressiveSplitting("split-all-cold",
|
|||
cl::desc("outline as many cold basic blocks as possible"),
|
||||
cl::Optional);
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional);
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintDebugInfo("print-debug-info",
|
||||
cl::desc("print debug info when printing functions"),
|
||||
|
@ -1254,378 +1252,47 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) {
|
|||
if (BasicBlocksLayout.empty() || Type == LT_NONE)
|
||||
return;
|
||||
|
||||
if (Type == LT_REVERSE) {
|
||||
BasicBlockOrderType ReverseOrder;
|
||||
auto FirstBB = BasicBlocksLayout.front();
|
||||
ReverseOrder.push_back(FirstBB);
|
||||
for (auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI)
|
||||
ReverseOrder.push_back(*RBBI);
|
||||
BasicBlocksLayout.swap(ReverseOrder);
|
||||
|
||||
if (Split)
|
||||
splitFunction();
|
||||
|
||||
fixBranches();
|
||||
|
||||
return;
|
||||
}
|
||||
BasicBlockOrderType NewLayout;
|
||||
std::unique_ptr<ReorderAlgorithm> Algo;
|
||||
|
||||
// Cannot do optimal layout without profile.
|
||||
if (!hasValidProfile())
|
||||
if (Type != LT_REVERSE && !hasValidProfile())
|
||||
return;
|
||||
|
||||
// Work on optimal solution if problem is small enough
|
||||
if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD)
|
||||
return solveOptimalLayout(Split);
|
||||
if (Type == LT_REVERSE) {
|
||||
Algo.reset(new ReverseReorderAlgorithm());
|
||||
}
|
||||
else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) {
|
||||
// Work on optimal solution if problem is small enough
|
||||
DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n");
|
||||
Algo.reset(new OptimalReorderAlgorithm());
|
||||
}
|
||||
else {
|
||||
DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n");
|
||||
|
||||
DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n");
|
||||
std::unique_ptr<ClusterAlgorithm> CAlgo(new GreedyClusterAlgorithm());
|
||||
|
||||
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
|
||||
// maximize weight during a path traversing all BBs. In this way, we will
|
||||
// convert the hottest branches into fall-throughs.
|
||||
switch(Type) {
|
||||
case LT_OPTIMIZE:
|
||||
Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
|
||||
break;
|
||||
|
||||
// Encode an edge between two basic blocks, source and destination
|
||||
typedef std::pair<BinaryBasicBlock *, BinaryBasicBlock *> EdgeTy;
|
||||
std::map<EdgeTy, uint64_t> Weight;
|
||||
case LT_OPTIMIZE_BRANCH:
|
||||
Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
|
||||
break;
|
||||
|
||||
// Define a comparison function to establish SWO between edges
|
||||
auto Comp = [&] (EdgeTy A, EdgeTy B) {
|
||||
// With equal weights, prioritize branches with lower index
|
||||
// source/destination. This helps to keep original block order for blocks
|
||||
// when optimal order cannot be deducted from a profile.
|
||||
if (Weight[A] == Weight[B]) {
|
||||
uint32_t ASrcBBIndex = getIndex(A.first);
|
||||
uint32_t BSrcBBIndex = getIndex(B.first);
|
||||
if (ASrcBBIndex != BSrcBBIndex)
|
||||
return ASrcBBIndex > BSrcBBIndex;
|
||||
return getIndex(A.second) > getIndex(B.second);
|
||||
}
|
||||
return Weight[A] < Weight[B];
|
||||
};
|
||||
std::priority_queue<EdgeTy, std::vector<EdgeTy>, decltype(Comp)> Queue(Comp);
|
||||
case LT_OPTIMIZE_CACHE:
|
||||
Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
|
||||
break;
|
||||
|
||||
typedef std::vector<BinaryBasicBlock *> ClusterTy;
|
||||
typedef std::map<BinaryBasicBlock *, int> BBToClusterMapTy;
|
||||
std::vector<ClusterTy> Clusters;
|
||||
BBToClusterMapTy BBToClusterMap;
|
||||
|
||||
// Encode relative weights between two clusters
|
||||
std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
|
||||
ClusterEdges.resize(BasicBlocksLayout.size());
|
||||
|
||||
for (auto BB : BasicBlocksLayout) {
|
||||
// Create a cluster for this BB
|
||||
uint32_t I = Clusters.size();
|
||||
Clusters.emplace_back();
|
||||
auto &Cluster = Clusters.back();
|
||||
Cluster.push_back(BB);
|
||||
BBToClusterMap[BB] = I;
|
||||
// Populate priority queue with edges
|
||||
auto BI = BB->BranchInfo.begin();
|
||||
for (auto &I : BB->successors()) {
|
||||
if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
|
||||
Weight[std::make_pair(BB, I)] = BI->Count;
|
||||
Queue.push(std::make_pair(BB, I));
|
||||
++BI;
|
||||
default:
|
||||
llvm_unreachable("unexpected layout type");
|
||||
}
|
||||
}
|
||||
|
||||
// Grow clusters in a greedy fashion
|
||||
while (!Queue.empty()) {
|
||||
auto elmt = Queue.top();
|
||||
Queue.pop();
|
||||
|
||||
BinaryBasicBlock *BBSrc = elmt.first;
|
||||
BinaryBasicBlock *BBDst = elmt.second;
|
||||
|
||||
// Case 1: BBSrc and BBDst are the same. Ignore this edge
|
||||
if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin())
|
||||
continue;
|
||||
|
||||
int I = BBToClusterMap[BBSrc];
|
||||
int J = BBToClusterMap[BBDst];
|
||||
|
||||
// Case 2: If they are already allocated at the same cluster, just increase
|
||||
// the weight of this cluster
|
||||
if (I == J) {
|
||||
ClusterEdges[I][I] += Weight[elmt];
|
||||
continue;
|
||||
}
|
||||
|
||||
auto &ClusterA = Clusters[I];
|
||||
auto &ClusterB = Clusters[J];
|
||||
if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
|
||||
// Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
|
||||
// allowing us to merge two clusters
|
||||
for (auto BB : ClusterB)
|
||||
BBToClusterMap[BB] = I;
|
||||
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
|
||||
ClusterB.clear();
|
||||
// Iterate through all inter-cluster edges and transfer edges targeting
|
||||
// cluster B to cluster A.
|
||||
// It is bad to have to iterate though all edges when we could have a list
|
||||
// of predecessors for cluster B. However, it's not clear if it is worth
|
||||
// the added code complexity to create a data structure for clusters that
|
||||
// maintains a list of predecessors. Maybe change this if it becomes a
|
||||
// deal breaker.
|
||||
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
|
||||
ClusterEdges[K][I] += ClusterEdges[K][J];
|
||||
} else {
|
||||
// Case 4: Both BBSrc and BBDst are allocated in positions we cannot
|
||||
// merge them. Annotate the weight of this edge in the weight between
|
||||
// clusters to help us decide ordering between these clusters.
|
||||
ClusterEdges[I][J] += Weight[elmt];
|
||||
}
|
||||
}
|
||||
std::vector<uint32_t> Order; // Cluster layout order
|
||||
|
||||
// Here we have 3 conflicting goals as to how to layout clusters. If we want
|
||||
// to minimize jump offsets, we should put clusters with heavy inter-cluster
|
||||
// dependence as close as possible. If we want to maximize the probability
|
||||
// that all inter-cluster edges are predicted as not-taken, we should enforce
|
||||
// a topological order to make targets appear after sources, creating forward
|
||||
// branches. If we want to separate hot from cold blocks to maximize the
|
||||
// probability that unfrequently executed code doesn't pollute the cache, we
|
||||
// should put clusters in descending order of hotness.
|
||||
std::vector<double> AvgFreq;
|
||||
AvgFreq.resize(Clusters.size(), 0.0);
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
double Freq = 0.0;
|
||||
for (auto BB : Clusters[I]) {
|
||||
if (!BB->empty() && BB->size() != BB->getNumPseudos())
|
||||
Freq += ((double) BB->getExecutionCount()) /
|
||||
(BB->size() - BB->getNumPseudos());
|
||||
}
|
||||
AvgFreq[I] = Freq;
|
||||
}
|
||||
|
||||
if (opts::PrintClusters) {
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I]
|
||||
<< ") : ";
|
||||
auto Sep = "";
|
||||
for (auto BB : Clusters[I]) {
|
||||
errs() << Sep << BB->getName();
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << "\n";
|
||||
};
|
||||
}
|
||||
|
||||
switch(Type) {
|
||||
case LT_OPTIMIZE: {
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
|
||||
if (!Clusters[I].empty())
|
||||
Order.push_back(I);
|
||||
break;
|
||||
}
|
||||
case LT_OPTIMIZE_BRANCH: {
|
||||
// Do a topological sort for clusters, prioritizing frequently-executed BBs
|
||||
// during the traversal.
|
||||
std::stack<uint32_t> Stack;
|
||||
std::vector<uint32_t> Status;
|
||||
std::vector<uint32_t> Parent;
|
||||
Status.resize(Clusters.size(), 0);
|
||||
Parent.resize(Clusters.size(), 0);
|
||||
constexpr uint32_t STACKED = 1;
|
||||
constexpr uint32_t VISITED = 2;
|
||||
Status[0] = STACKED;
|
||||
Stack.push(0);
|
||||
while (!Stack.empty()) {
|
||||
uint32_t I = Stack.top();
|
||||
if (!(Status[I] & VISITED)) {
|
||||
Status[I] |= VISITED;
|
||||
// Order successors by weight
|
||||
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
|
||||
return ClusterEdges[I][A] > ClusterEdges[I][B];
|
||||
};
|
||||
std::priority_queue<uint32_t, std::vector<uint32_t>,
|
||||
decltype(ClusterComp)> SuccQueue(ClusterComp);
|
||||
for (auto &Target: ClusterEdges[I]) {
|
||||
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
|
||||
!Clusters[Target.first].empty()) {
|
||||
Parent[Target.first] = I;
|
||||
Status[Target.first] = STACKED;
|
||||
SuccQueue.push(Target.first);
|
||||
}
|
||||
}
|
||||
while (!SuccQueue.empty()) {
|
||||
Stack.push(SuccQueue.top());
|
||||
SuccQueue.pop();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// Already visited this node
|
||||
Stack.pop();
|
||||
Order.push_back(I);
|
||||
}
|
||||
std::reverse(Order.begin(), Order.end());
|
||||
// Put unreachable clusters at the end
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
|
||||
if (!(Status[I] & VISITED) && !Clusters[I].empty())
|
||||
Order.push_back(I);
|
||||
|
||||
// Sort nodes with equal precedence
|
||||
auto Beg = Order.begin();
|
||||
// Don't reorder the first cluster, which contains the function entry point
|
||||
++Beg;
|
||||
std::stable_sort(Beg, Order.end(),
|
||||
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
|
||||
uint32_t P = Parent[A];
|
||||
while (Parent[P] != 0) {
|
||||
if (Parent[P] == B)
|
||||
return false;
|
||||
P = Parent[P];
|
||||
}
|
||||
P = Parent[B];
|
||||
while (Parent[P] != 0) {
|
||||
if (Parent[P] == A)
|
||||
return true;
|
||||
P = Parent[P];
|
||||
}
|
||||
return AvgFreq[A] > AvgFreq[B];
|
||||
});
|
||||
break;
|
||||
}
|
||||
case LT_OPTIMIZE_CACHE: {
|
||||
// Order clusters based on average instruction execution frequency
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
|
||||
if (!Clusters[I].empty())
|
||||
Order.push_back(I);
|
||||
auto Beg = Order.begin();
|
||||
// Don't reorder the first cluster, which contains the function entry point
|
||||
++Beg;
|
||||
std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) {
|
||||
return AvgFreq[A] > AvgFreq[B];
|
||||
});
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("unexpected layout type");
|
||||
}
|
||||
|
||||
if (opts::PrintClusters) {
|
||||
errs() << "New cluster order: ";
|
||||
auto Sep = "";
|
||||
for (auto O : Order) {
|
||||
errs() << Sep << O;
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << '\n';
|
||||
}
|
||||
|
||||
Algo->reorderBasicBlocks(*this, NewLayout);
|
||||
BasicBlocksLayout.clear();
|
||||
for (auto I : Order) {
|
||||
auto &Cluster = Clusters[I];
|
||||
BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(),
|
||||
Cluster.end());
|
||||
}
|
||||
|
||||
if (Split)
|
||||
splitFunction();
|
||||
fixBranches();
|
||||
}
|
||||
|
||||
void BinaryFunction::solveOptimalLayout(bool Split) {
|
||||
std::vector<std::vector<uint64_t>> Weight;
|
||||
std::map<BinaryBasicBlock *, int> BBToIndex;
|
||||
std::vector<BinaryBasicBlock *> IndexToBB;
|
||||
|
||||
DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n");
|
||||
|
||||
unsigned N = BasicBlocksLayout.size();
|
||||
// Populating weight map and index map
|
||||
for (auto BB : BasicBlocksLayout) {
|
||||
BBToIndex[BB] = IndexToBB.size();
|
||||
IndexToBB.push_back(BB);
|
||||
}
|
||||
Weight.resize(N);
|
||||
for (auto BB : BasicBlocksLayout) {
|
||||
auto BI = BB->BranchInfo.begin();
|
||||
Weight[BBToIndex[BB]].resize(N);
|
||||
for (auto I : BB->successors()) {
|
||||
if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
|
||||
Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
|
||||
++BI;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> DP;
|
||||
DP.resize(1 << N);
|
||||
for (auto &Elmt : DP) {
|
||||
Elmt.resize(N, -1);
|
||||
}
|
||||
// Start with the entry basic block being allocated with cost zero
|
||||
DP[1][0] = 0;
|
||||
// Walk through TSP solutions using a bitmask to represent state (current set
|
||||
// of BBs in the layout)
|
||||
unsigned BestSet = 1;
|
||||
unsigned BestLast = 0;
|
||||
int64_t BestWeight = 0;
|
||||
for (unsigned Set = 1; Set < (1U << N); ++Set) {
|
||||
// Traverse each possibility of Last BB visited in this layout
|
||||
for (unsigned Last = 0; Last < N; ++Last) {
|
||||
// Case 1: There is no possible layout with this BB as Last
|
||||
if (DP[Set][Last] == -1)
|
||||
continue;
|
||||
|
||||
// Case 2: There is a layout with this Set and this Last, and we try
|
||||
// to expand this set with New
|
||||
for (unsigned New = 1; New < N; ++New) {
|
||||
// Case 2a: BB "New" is already in this Set
|
||||
if ((Set & (1 << New)) != 0)
|
||||
continue;
|
||||
|
||||
// Case 2b: BB "New" is not in this set and we add it to this Set and
|
||||
// record total weight of this layout with "New" as the last BB.
|
||||
unsigned NewSet = (Set | (1 << New));
|
||||
if (DP[NewSet][New] == -1)
|
||||
DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
|
||||
DP[NewSet][New] = std::max(DP[NewSet][New],
|
||||
DP[Set][Last] + (int64_t)Weight[Last][New]);
|
||||
|
||||
if (DP[NewSet][New] > BestWeight) {
|
||||
BestWeight = DP[NewSet][New];
|
||||
BestSet = NewSet;
|
||||
BestLast = New;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<BinaryBasicBlock *> PastLayout = BasicBlocksLayout;
|
||||
|
||||
// Define final function layout based on layout that maximizes weight
|
||||
BasicBlocksLayout.clear();
|
||||
unsigned Last = BestLast;
|
||||
unsigned Set = BestSet;
|
||||
std::vector<bool> Visited;
|
||||
Visited.resize(N);
|
||||
Visited[Last] = true;
|
||||
BasicBlocksLayout.push_back(IndexToBB[Last]);
|
||||
Set = Set & ~(1U << Last);
|
||||
while (Set != 0) {
|
||||
int64_t Best = -1;
|
||||
for (unsigned I = 0; I < N; ++I) {
|
||||
if (DP[Set][I] == -1)
|
||||
continue;
|
||||
if (DP[Set][I] > Best) {
|
||||
Last = I;
|
||||
Best = DP[Set][I];
|
||||
}
|
||||
}
|
||||
Visited[Last] = true;
|
||||
BasicBlocksLayout.push_back(IndexToBB[Last]);
|
||||
Set = Set & ~(1U << Last);
|
||||
}
|
||||
std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end());
|
||||
|
||||
// Finalize layout with BBs that weren't assigned to the layout
|
||||
for (auto BB : PastLayout) {
|
||||
if (Visited[BBToIndex[BB]] == false)
|
||||
BasicBlocksLayout.push_back(BB);
|
||||
}
|
||||
BasicBlocksLayout.swap(NewLayout);
|
||||
|
||||
if (Split)
|
||||
splitFunction();
|
||||
|
|
|
@ -306,6 +306,9 @@ public:
|
|||
|
||||
typedef BasicBlockOrderType::iterator order_iterator;
|
||||
typedef BasicBlockOrderType::const_iterator const_order_iterator;
|
||||
typedef BasicBlockOrderType::reverse_iterator reverse_order_iterator;
|
||||
typedef BasicBlockOrderType::const_reverse_iterator
|
||||
const_reverse_order_iterator;
|
||||
|
||||
// CFG iterators.
|
||||
iterator begin() { return BasicBlocks.begin(); }
|
||||
|
@ -325,19 +328,39 @@ public:
|
|||
const BinaryBasicBlock & back() const { return *BasicBlocks.back(); }
|
||||
BinaryBasicBlock & back() { return *BasicBlocks.back(); }
|
||||
|
||||
unsigned layout_size() const {
|
||||
return (unsigned)BasicBlocksLayout.size();
|
||||
}
|
||||
const_order_iterator layout_begin() const {
|
||||
return BasicBlocksLayout.begin();
|
||||
}
|
||||
order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
|
||||
order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
|
||||
const_order_iterator layout_begin() const
|
||||
{ return BasicBlocksLayout.begin(); }
|
||||
order_iterator layout_end() { return BasicBlocksLayout.end(); }
|
||||
const_order_iterator layout_end() const
|
||||
{ return BasicBlocksLayout.end(); }
|
||||
reverse_order_iterator layout_rbegin()
|
||||
{ return BasicBlocksLayout.rbegin(); }
|
||||
const_reverse_order_iterator layout_rbegin() const
|
||||
{ return BasicBlocksLayout.rbegin(); }
|
||||
reverse_order_iterator layout_rend()
|
||||
{ return BasicBlocksLayout.rend(); }
|
||||
const_reverse_order_iterator layout_rend() const
|
||||
{ return BasicBlocksLayout.rend(); }
|
||||
unsigned layout_size() const { return (unsigned)BasicBlocksLayout.size(); }
|
||||
bool layout_empty() const { return BasicBlocksLayout.empty(); }
|
||||
const BinaryBasicBlock *layout_front() const
|
||||
{ return BasicBlocksLayout.front(); }
|
||||
BinaryBasicBlock *layout_front() { return BasicBlocksLayout.front(); }
|
||||
const BinaryBasicBlock *layout_back() const
|
||||
{ return BasicBlocksLayout.back(); }
|
||||
BinaryBasicBlock *layout_back() { return BasicBlocksLayout.back(); }
|
||||
|
||||
inline iterator_range<order_iterator> layout() {
|
||||
return iterator_range<order_iterator>(BasicBlocksLayout.begin(),
|
||||
BasicBlocksLayout.end());
|
||||
}
|
||||
|
||||
inline iterator_range<const_order_iterator> layout() const {
|
||||
return iterator_range<const_order_iterator>(BasicBlocksLayout.begin(),
|
||||
BasicBlocksLayout.end());
|
||||
}
|
||||
|
||||
cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); }
|
||||
const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); }
|
||||
cfi_iterator cie_end() { return CIEFrameInstructions.end(); }
|
||||
|
@ -368,14 +391,6 @@ public:
|
|||
/// end of basic blocks.
|
||||
void modifyLayout(LayoutType Type, bool Split);
|
||||
|
||||
/// Dynamic programming implementation for the TSP, applied to BB layout. Find
|
||||
/// the optimal way to maximize weight during a path traversing all BBs. In
|
||||
/// this way, we will convert the hottest branches into fall-throughs.
|
||||
///
|
||||
/// Uses exponential amount of memory on the number of basic blocks and should
|
||||
/// only be used for small functions.
|
||||
void solveOptimalLayout(bool Split);
|
||||
|
||||
/// View CFG in graphviz program
|
||||
void viewGraph();
|
||||
|
||||
|
|
|
@ -24,4 +24,5 @@ add_llvm_tool(llvm-bolt
|
|||
DebugData.cpp
|
||||
Exceptions.cpp
|
||||
RewriteInstance.cpp
|
||||
ReorderAlgorithm.cpp
|
||||
)
|
||||
|
|
|
@ -0,0 +1,436 @@
|
|||
//===--- ReorderAlgorithm.cpp - Basic block reordering algorithms ---------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Implements different basic block reordering algorithms.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ReorderAlgorithm.h"
|
||||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include <queue>
|
||||
|
||||
using namespace llvm;
|
||||
using namespace bolt;
|
||||
|
||||
namespace opts {
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional);
|
||||
|
||||
} // namespace opts
|
||||
|
||||
/// Fill AvgFreq with one value per cluster. Each value is the sum, over the
/// cluster's blocks, of getExecutionCount() divided by
/// (size() - getNumPseudos()). Blocks that are empty, or whose size equals
/// their pseudo count, contribute nothing (this also avoids dividing by zero).
void ClusterAlgorithm::computeClusterAverageFrequency() {
  AvgFreq.resize(Clusters.size(), 0.0);
  for (uint32_t Idx = 0, End = Clusters.size(); Idx < End; ++Idx) {
    double Total = 0.0;
    for (auto Block : Clusters[Idx]) {
      if (Block->empty())
        continue;
      if (Block->size() == Block->getNumPseudos())
        continue;
      Total += ((double) Block->getExecutionCount()) /
               (Block->size() - Block->getNumPseudos());
    }
    AvgFreq[Idx] = Total;
  }
}
|
||||
|
||||
void ClusterAlgorithm::printClusters() const {
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
errs() << "Cluster number " << I;
|
||||
if (AvgFreq.size() == Clusters.size())
|
||||
errs() << " (frequency: " << AvgFreq[I] << ")";
|
||||
errs() << " : ";
|
||||
auto Sep = "";
|
||||
for (auto BB : Clusters[I]) {
|
||||
errs() << Sep << BB->getName();
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop all state from a previous clustering run: the clusters, the
/// inter-cluster edge weights and the per-cluster frequencies.
void ClusterAlgorithm::reset() {
  AvgFreq.clear();
  ClusterEdges.clear();
  Clusters.clear();
}
|
||||
|
||||
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) {
|
||||
reset();
|
||||
|
||||
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
|
||||
// maximize weight during a path traversing all BBs. In this way, we will
|
||||
// convert the hottest branches into fall-throughs.
|
||||
|
||||
// Encode an edge between two basic blocks, source and destination
|
||||
typedef std::pair<BinaryBasicBlock *, BinaryBasicBlock *> EdgeTy;
|
||||
std::map<EdgeTy, uint64_t> Weight;
|
||||
|
||||
// Define a comparison function to establish SWO between edges
|
||||
auto Comp = [&] (EdgeTy A, EdgeTy B) {
|
||||
// With equal weights, prioritize branches with lower index
|
||||
// source/destination. This helps to keep original block order for blocks
|
||||
// when optimal order cannot be deducted from a profile.
|
||||
if (Weight[A] == Weight[B]) {
|
||||
uint32_t ASrcBBIndex = BF.getIndex(A.first);
|
||||
uint32_t BSrcBBIndex = BF.getIndex(B.first);
|
||||
if (ASrcBBIndex != BSrcBBIndex)
|
||||
return ASrcBBIndex > BSrcBBIndex;
|
||||
return BF.getIndex(A.second) > BF.getIndex(B.second);
|
||||
}
|
||||
return Weight[A] < Weight[B];
|
||||
};
|
||||
std::priority_queue<EdgeTy, std::vector<EdgeTy>, decltype(Comp)> Queue(Comp);
|
||||
|
||||
typedef std::map<BinaryBasicBlock *, int> BBToClusterMapTy;
|
||||
BBToClusterMapTy BBToClusterMap;
|
||||
|
||||
ClusterEdges.resize(BF.layout_size());
|
||||
|
||||
for (auto BB : BF.layout()) {
|
||||
// Create a cluster for this BB
|
||||
uint32_t I = Clusters.size();
|
||||
Clusters.emplace_back();
|
||||
auto &Cluster = Clusters.back();
|
||||
Cluster.push_back(BB);
|
||||
BBToClusterMap[BB] = I;
|
||||
// Populate priority queue with edges
|
||||
auto BI = BB->branch_info_begin();
|
||||
for (auto &I : BB->successors()) {
|
||||
if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
|
||||
Weight[std::make_pair(BB, I)] = BI->Count;
|
||||
Queue.push(std::make_pair(BB, I));
|
||||
++BI;
|
||||
}
|
||||
}
|
||||
|
||||
// Grow clusters in a greedy fashion
|
||||
while (!Queue.empty()) {
|
||||
auto elmt = Queue.top();
|
||||
Queue.pop();
|
||||
|
||||
BinaryBasicBlock *BBSrc = elmt.first;
|
||||
BinaryBasicBlock *BBDst = elmt.second;
|
||||
|
||||
// Case 1: BBSrc and BBDst are the same. Ignore this edge
|
||||
if (BBSrc == BBDst || BBDst == *BF.layout_begin())
|
||||
continue;
|
||||
|
||||
int I = BBToClusterMap[BBSrc];
|
||||
int J = BBToClusterMap[BBDst];
|
||||
|
||||
// Case 2: If they are already allocated at the same cluster, just increase
|
||||
// the weight of this cluster
|
||||
if (I == J) {
|
||||
ClusterEdges[I][I] += Weight[elmt];
|
||||
continue;
|
||||
}
|
||||
|
||||
auto &ClusterA = Clusters[I];
|
||||
auto &ClusterB = Clusters[J];
|
||||
if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
|
||||
// Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
|
||||
// allowing us to merge two clusters
|
||||
for (auto BB : ClusterB)
|
||||
BBToClusterMap[BB] = I;
|
||||
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
|
||||
ClusterB.clear();
|
||||
// Iterate through all inter-cluster edges and transfer edges targeting
|
||||
// cluster B to cluster A.
|
||||
// It is bad to have to iterate though all edges when we could have a list
|
||||
// of predecessors for cluster B. However, it's not clear if it is worth
|
||||
// the added code complexity to create a data structure for clusters that
|
||||
// maintains a list of predecessors. Maybe change this if it becomes a
|
||||
// deal breaker.
|
||||
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
|
||||
ClusterEdges[K][I] += ClusterEdges[K][J];
|
||||
} else {
|
||||
// Case 4: Both BBSrc and BBDst are allocated in positions we cannot
|
||||
// merge them. Annotate the weight of this edge in the weight between
|
||||
// clusters to help us decide ordering between these clusters.
|
||||
ClusterEdges[I][J] += Weight[elmt];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void OptimalReorderAlgorithm::reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
std::vector<std::vector<uint64_t>> Weight;
|
||||
std::map<BinaryBasicBlock *, int> BBToIndex;
|
||||
std::vector<BinaryBasicBlock *> IndexToBB;
|
||||
|
||||
unsigned N = BF.layout_size();
|
||||
// Populating weight map and index map
|
||||
for (auto BB : BF.layout()) {
|
||||
BBToIndex[BB] = IndexToBB.size();
|
||||
IndexToBB.push_back(BB);
|
||||
}
|
||||
Weight.resize(N);
|
||||
for (auto BB : BF.layout()) {
|
||||
auto BI = BB->branch_info_begin();
|
||||
Weight[BBToIndex[BB]].resize(N);
|
||||
for (auto I : BB->successors()) {
|
||||
if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
|
||||
Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
|
||||
++BI;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> DP;
|
||||
DP.resize(1 << N);
|
||||
for (auto &Elmt : DP) {
|
||||
Elmt.resize(N, -1);
|
||||
}
|
||||
// Start with the entry basic block being allocated with cost zero
|
||||
DP[1][0] = 0;
|
||||
// Walk through TSP solutions using a bitmask to represent state (current set
|
||||
// of BBs in the layout)
|
||||
unsigned BestSet = 1;
|
||||
unsigned BestLast = 0;
|
||||
int64_t BestWeight = 0;
|
||||
for (unsigned Set = 1; Set < (1U << N); ++Set) {
|
||||
// Traverse each possibility of Last BB visited in this layout
|
||||
for (unsigned Last = 0; Last < N; ++Last) {
|
||||
// Case 1: There is no possible layout with this BB as Last
|
||||
if (DP[Set][Last] == -1)
|
||||
continue;
|
||||
|
||||
// Case 2: There is a layout with this Set and this Last, and we try
|
||||
// to expand this set with New
|
||||
for (unsigned New = 1; New < N; ++New) {
|
||||
// Case 2a: BB "New" is already in this Set
|
||||
if ((Set & (1 << New)) != 0)
|
||||
continue;
|
||||
|
||||
// Case 2b: BB "New" is not in this set and we add it to this Set and
|
||||
// record total weight of this layout with "New" as the last BB.
|
||||
unsigned NewSet = (Set | (1 << New));
|
||||
if (DP[NewSet][New] == -1)
|
||||
DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
|
||||
DP[NewSet][New] = std::max(DP[NewSet][New],
|
||||
DP[Set][Last] + (int64_t)Weight[Last][New]);
|
||||
|
||||
if (DP[NewSet][New] > BestWeight) {
|
||||
BestWeight = DP[NewSet][New];
|
||||
BestSet = NewSet;
|
||||
BestLast = New;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Define final function layout based on layout that maximizes weight
|
||||
unsigned Last = BestLast;
|
||||
unsigned Set = BestSet;
|
||||
std::vector<bool> Visited;
|
||||
Visited.resize(N);
|
||||
Visited[Last] = true;
|
||||
Order.push_back(IndexToBB[Last]);
|
||||
Set = Set & ~(1U << Last);
|
||||
while (Set != 0) {
|
||||
int64_t Best = -1;
|
||||
for (unsigned I = 0; I < N; ++I) {
|
||||
if (DP[Set][I] == -1)
|
||||
continue;
|
||||
if (DP[Set][I] > Best) {
|
||||
Last = I;
|
||||
Best = DP[Set][I];
|
||||
}
|
||||
}
|
||||
Visited[Last] = true;
|
||||
Order.push_back(IndexToBB[Last]);
|
||||
Set = Set & ~(1U << Last);
|
||||
}
|
||||
std::reverse(Order.begin(), Order.end());
|
||||
|
||||
// Finalize layout with BBs that weren't assigned to the layout
|
||||
for (auto BB : BF.layout()) {
|
||||
if (Visited[BBToIndex[BB]] == false)
|
||||
Order.push_back(BB);
|
||||
}
|
||||
}
|
||||
|
||||
void OptimizeReorderAlgorithm::reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
if (BF.layout_empty())
|
||||
return;
|
||||
|
||||
// Cluster basic blocks.
|
||||
CAlgo->clusterBasicBlocks(BF);
|
||||
|
||||
if (opts::PrintClusters)
|
||||
CAlgo->printClusters();
|
||||
|
||||
// Arrange basic blocks according to clusters.
|
||||
for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
|
||||
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
|
||||
}
|
||||
|
||||
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
if (BF.layout_empty())
|
||||
return;
|
||||
|
||||
// Cluster basic blocks.
|
||||
CAlgo->clusterBasicBlocks(BF);
|
||||
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;;
|
||||
std::vector<std::map<uint32_t, uint64_t>> &ClusterEdges = CAlgo->ClusterEdges;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
CAlgo->printClusters();
|
||||
|
||||
// Cluster layout order
|
||||
std::vector<uint32_t> ClusterOrder;
|
||||
|
||||
// Do a topological sort for clusters, prioritizing frequently-executed BBs
|
||||
// during the traversal.
|
||||
std::stack<uint32_t> Stack;
|
||||
std::vector<uint32_t> Status;
|
||||
std::vector<uint32_t> Parent;
|
||||
Status.resize(Clusters.size(), 0);
|
||||
Parent.resize(Clusters.size(), 0);
|
||||
constexpr uint32_t STACKED = 1;
|
||||
constexpr uint32_t VISITED = 2;
|
||||
Status[0] = STACKED;
|
||||
Stack.push(0);
|
||||
while (!Stack.empty()) {
|
||||
uint32_t I = Stack.top();
|
||||
if (!(Status[I] & VISITED)) {
|
||||
Status[I] |= VISITED;
|
||||
// Order successors by weight
|
||||
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
|
||||
return ClusterEdges[I][A] > ClusterEdges[I][B];
|
||||
};
|
||||
std::priority_queue<uint32_t, std::vector<uint32_t>,
|
||||
decltype(ClusterComp)> SuccQueue(ClusterComp);
|
||||
for (auto &Target: ClusterEdges[I]) {
|
||||
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
|
||||
!Clusters[Target.first].empty()) {
|
||||
Parent[Target.first] = I;
|
||||
Status[Target.first] = STACKED;
|
||||
SuccQueue.push(Target.first);
|
||||
}
|
||||
}
|
||||
while (!SuccQueue.empty()) {
|
||||
Stack.push(SuccQueue.top());
|
||||
SuccQueue.pop();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// Already visited this node
|
||||
Stack.pop();
|
||||
ClusterOrder.push_back(I);
|
||||
}
|
||||
std::reverse(ClusterOrder.begin(), ClusterOrder.end());
|
||||
// Put unreachable clusters at the end
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
|
||||
if (!(Status[I] & VISITED) && !Clusters[I].empty())
|
||||
ClusterOrder.push_back(I);
|
||||
|
||||
// Sort nodes with equal precedence
|
||||
auto Beg = ClusterOrder.begin();
|
||||
// Don't reorder the first cluster, which contains the function entry point
|
||||
++Beg;
|
||||
std::stable_sort(Beg, ClusterOrder.end(),
|
||||
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
|
||||
uint32_t P = Parent[A];
|
||||
while (Parent[P] != 0) {
|
||||
if (Parent[P] == B)
|
||||
return false;
|
||||
P = Parent[P];
|
||||
}
|
||||
P = Parent[B];
|
||||
while (Parent[P] != 0) {
|
||||
if (Parent[P] == A)
|
||||
return true;
|
||||
P = Parent[P];
|
||||
}
|
||||
return AvgFreq[A] > AvgFreq[B];
|
||||
});
|
||||
|
||||
if (opts::PrintClusters) {
|
||||
errs() << "New cluster order: ";
|
||||
auto Sep = "";
|
||||
for (auto O : ClusterOrder) {
|
||||
errs() << Sep << O;
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << '\n';
|
||||
}
|
||||
|
||||
// Arrange basic blocks according to cluster order.
|
||||
for (uint32_t ClusterIndex : ClusterOrder) {
|
||||
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
|
||||
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
|
||||
}
|
||||
}
|
||||
|
||||
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
if (BF.layout_empty())
|
||||
return;
|
||||
|
||||
// Cluster basic blocks.
|
||||
CAlgo->clusterBasicBlocks(BF);
|
||||
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
CAlgo->printClusters();
|
||||
|
||||
// Cluster layout order
|
||||
std::vector<uint32_t> ClusterOrder;
|
||||
|
||||
// Order clusters based on average instruction execution frequency
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
|
||||
if (!Clusters[I].empty())
|
||||
ClusterOrder.push_back(I);
|
||||
auto Beg = ClusterOrder.begin();
|
||||
// Don't reorder the first cluster, which contains the function entry point
|
||||
++Beg;
|
||||
std::stable_sort(Beg, ClusterOrder.end(), [&AvgFreq](uint32_t A, uint32_t B) {
|
||||
return AvgFreq[A] > AvgFreq[B];
|
||||
});
|
||||
|
||||
if (opts::PrintClusters) {
|
||||
errs() << "New cluster order: ";
|
||||
auto Sep = "";
|
||||
for (auto O : ClusterOrder) {
|
||||
errs() << Sep << O;
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << '\n';
|
||||
}
|
||||
|
||||
// Arrange basic blocks according to cluster order.
|
||||
for (uint32_t ClusterIndex : ClusterOrder) {
|
||||
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
|
||||
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
|
||||
}
|
||||
}
|
||||
|
||||
void ReverseReorderAlgorithm::reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
if (BF.layout_empty())
|
||||
return;
|
||||
|
||||
auto FirstBB = *BF.layout_begin();
|
||||
Order.push_back(FirstBB);
|
||||
for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
|
||||
Order.push_back(*RLI);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,168 @@
|
|||
//===- ReorderAlgorithm.h - Basic block reordering algorithms interface ---===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Interface to different basic block reordering algorithms.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
|
||||
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
|
||||
class BinaryBasicBlock;
|
||||
class BinaryFunction;
|
||||
|
||||
/// Objects of this class implement various basic block clustering algorithms.
|
||||
/// Basic block clusters are chains of basic blocks that should be laid out
|
||||
/// in this order to maximize performance. These algorithms group basic blocks
|
||||
/// into clusters using execution profile data and various heuristics.
|
||||
class ClusterAlgorithm {
|
||||
public:
|
||||
typedef std::vector<BinaryBasicBlock *> ClusterTy;
|
||||
std::vector<ClusterTy> Clusters;
|
||||
std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
|
||||
std::vector<double> AvgFreq;
|
||||
|
||||
/// Group the basic blocks the given function into clusters stored in the
|
||||
/// Clusters vector. Also encode relative weights between two clusters in
|
||||
/// the ClusterEdges vector. This vector is indexed by the clusters indices
|
||||
/// in the Clusters vector.
|
||||
virtual void clusterBasicBlocks(const BinaryFunction &BF) =0;
|
||||
|
||||
/// Compute for each cluster its averagae execution frequency, that is
|
||||
/// the sum of average frequencies of its blocks (execution count / # instrs).
|
||||
/// The average frequencies are stored in the AvgFreq vector, index by the
|
||||
/// cluster indices in the Clusters vector.
|
||||
void computeClusterAverageFrequency();
|
||||
|
||||
/// Clear clusters and related info.
|
||||
void reset();
|
||||
|
||||
void printClusters() const;
|
||||
|
||||
virtual ~ClusterAlgorithm() { }
|
||||
};
|
||||
|
||||
|
||||
/// This clustering algorithm is based on a greedy heuristic suggested by
|
||||
/// Pettis (PLDI '90).
|
||||
class GreedyClusterAlgorithm : public ClusterAlgorithm {
|
||||
public:
|
||||
void clusterBasicBlocks(const BinaryFunction &BF) override;
|
||||
};
|
||||
|
||||
/// Objects of this class implement various basic block reordering algorithms.
|
||||
/// Most of these algorithms depend on a clustering algorithm.
|
||||
/// Here we have 3 conflicting goals as to how to layout clusters. If we want
|
||||
/// to minimize jump offsets, we should put clusters with heavy inter-cluster
|
||||
/// dependence as close as possible. If we want to maximize the probability
|
||||
/// that all inter-cluster edges are predicted as not-taken, we should enforce
|
||||
/// a topological order to make targets appear after sources, creating forward
|
||||
/// branches. If we want to separate hot from cold blocks to maximize the
|
||||
/// probability that infrequently executed code doesn't pollute the cache, we
|
||||
/// should put clusters in descending order of hotness.
|
||||
class ReorderAlgorithm {
|
||||
protected:
|
||||
std::unique_ptr<ClusterAlgorithm> CAlgo;
|
||||
|
||||
public:
|
||||
ReorderAlgorithm() { }
|
||||
explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
||||
CAlgo(std::move(CAlgo)) { }
|
||||
|
||||
typedef std::vector<BinaryBasicBlock *> BasicBlockOrder;
|
||||
|
||||
/// Reorder the basic blocks of the given function and store the new order in
|
||||
/// the new Clusters vector.
|
||||
virtual void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const =0;
|
||||
|
||||
void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
|
||||
this->CAlgo.reset(CAlgo);
|
||||
}
|
||||
|
||||
virtual ~ReorderAlgorithm() { }
|
||||
};
|
||||
|
||||
|
||||
/// Dynamic programming implementation for the TSP, applied to BB layout. Find
|
||||
/// the optimal way to maximize weight during a path traversing all BBs. In
|
||||
/// this way, we will convert the hottest branches into fall-throughs.
|
||||
///
|
||||
/// Uses exponential amount of memory on the number of basic blocks and should
|
||||
/// only be used for small functions.
|
||||
class OptimalReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
||||
|
||||
/// Simple algorithm that groups basic blocks into clusters and then
|
||||
/// lays them out cluster after cluster.
|
||||
class OptimizeReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
||||
ReorderAlgorithm(std::move(CAlgo)) { }
|
||||
|
||||
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
||||
|
||||
/// This reorder algorithm tries to ensure that all inter-cluster edges are
|
||||
/// predicted as not-taken, by enforcing a topological order to make
|
||||
/// targets appear after sources, creating forward branches.
|
||||
class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
explicit OptimizeBranchReorderAlgorithm(
|
||||
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
||||
ReorderAlgorithm(std::move(CAlgo)) { }
|
||||
|
||||
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
||||
|
||||
/// This reorder tries to separate hot from cold blocks to maximize the
|
||||
/// probability that infrequently executed code doesn't pollute the cache, by
|
||||
/// putting clusters in descending order of hotness.
|
||||
class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
explicit OptimizeCacheReorderAlgorithm(
|
||||
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
||||
ReorderAlgorithm(std::move(CAlgo)) { }
|
||||
|
||||
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
||||
|
||||
/// Toy example that simply reverses the original basic block order.
|
||||
class ReverseReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue