[BOLT] merging cold basic blocks to reduce #jumps

Summary:
This diff introduces a modification of the cache+ block ordering algorithm
that reorders and merges cold blocks within a function, with the goal of
reducing the number of (non-fallthrough) jumps and, thus, the code size.
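As an illustration, here is a minimal sketch of the cold-merge rule, using
simplified stand-in types rather than BOLT's classes (the real implementation
is the mergeColdClusters() method in the diff below): when the last block of
one cluster jumps to the first block of another cluster, and the latter does
not hold the function entry, concatenating the two clusters turns that jump
into a fallthrough and removes one unconditional branch.

#include <vector>

// Simplified stand-ins for BOLT's basic-block and cluster types.
struct Block {
  int Id;
  std::vector<Block *> Succs; // successor blocks in the CFG
};

struct Cluster {
  std::vector<Block *> Blocks;
  bool IsEntry; // a cluster holding the function entry is never appended to
};

// Append Dst to Src when the jump Src.back() -> Dst.front() can become a
// fallthrough; each such merge eliminates one non-fallthrough jump.
void mergeIfFallthroughPossible(Cluster &Src, Cluster &Dst) {
  if (&Src == &Dst || Dst.IsEntry || Src.Blocks.empty() || Dst.Blocks.empty())
    return;
  for (Block *Succ : Src.Blocks.back()->Succs) {
    if (Succ == Dst.Blocks.front()) {
      Src.Blocks.insert(Src.Blocks.end(), Dst.Blocks.begin(), Dst.Blocks.end());
      Dst.Blocks.clear();
      return;
    }
  }
}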

(cherry picked from FBD8044978)
spupyrev 2018-05-17 11:14:15 -07:00 committed by Maksim Panchenko
parent b4dbd35d6c
commit 779541283a
4 changed files with 111 additions and 92 deletions

View File

@@ -481,7 +481,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
     break;
   case LT_OPTIMIZE_CACHE_PLUS:
-    Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
+    Algo.reset(new CachePlusReorderAlgorithm());
     break;
   case LT_OPTIMIZE_SHUFFLE:

View File

@@ -82,7 +82,7 @@ public:
     return Blocks;
   }

-  /// Update the list of basic blocks and meta-info
+  /// Update the list of basic blocks and aggregated cluster data
   void merge(const Cluster *Other,
              const std::vector<BinaryBasicBlock *> &MergedBlocks,
              double MergedScore) {
@@ -93,6 +93,10 @@ public:
     Score = MergedScore;
   }

+  void clear() {
+    Blocks.clear();
+  }
+
 private:
   std::vector<BinaryBasicBlock *> Blocks;
   size_t Id;
@@ -219,65 +223,14 @@ public:
   /// Run cache+ algorithm and return a basic block ordering
   std::vector<BinaryBasicBlock *> run() {
-    // Merge blocks with their fallthrough successors
-    for (auto BB : BF.layout()) {
-      if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
-          FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
-        auto CurBB = BB;
-        while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
-          const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
-          mergeClusters(&AllClusters[BB->getLayoutIndex()],
-                        &AllClusters[NextBB->getLayoutIndex()],
-                        0);
-          CurBB = NextBB;
-        }
-      }
-    }
-
-    // Merge pairs of clusters while there is an improvement in ExtTSP metric
-    while (Clusters.size() > 1) {
-      Cluster *BestClusterPred = nullptr;
-      Cluster *BestClusterSucc = nullptr;
-      std::pair<double, size_t> BestGain(-1, 0);
-      for (auto ClusterPred : Clusters) {
-        // Do not merge cold blocks
-        if (ClusterPred->isCold())
-          continue;
-
-        // Get candidates for merging with the current cluster
-        Adjacent.forAllAdjacent(
-          ClusterPred,
-          // Find the best candidate
-          [&](Cluster *ClusterSucc) {
-            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
-            assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
-
-            // Compute the gain of merging two clusters
-            auto Gain = mergeGain(ClusterPred, ClusterSucc);
-            if (Gain.first <= 0.0)
-              return;
-
-            // Breaking ties by density to make the hottest clusters be merged first
-            if (Gain.first > BestGain.first ||
-                (std::abs(Gain.first - BestGain.first) < 1e-8 &&
-                 compareClusterPairs(ClusterPred,
-                                     ClusterSucc,
-                                     BestClusterPred,
-                                     BestClusterSucc))) {
-              BestGain = Gain;
-              BestClusterPred = ClusterPred;
-              BestClusterSucc = ClusterSucc;
-            }
-          });
-      }
-
-      // Stop merging when there is no improvement
-      if (BestGain.first <= 0.0)
-        break;
-
-      // Merge the best pair of clusters
-      mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
-    }
+    // Pass 1: Merge blocks with their fallthrough successors
+    mergeFallthroughs();
+
+    // Pass 2: Merge pairs of clusters while improving the ExtTSP metric
+    mergeClusterPairs();
+
+    // Pass 3: Merge cold blocks to reduce code size
+    mergeColdClusters();

     // Sorting clusters by density
     std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
@@ -339,12 +292,14 @@ private:
     // Initialize clusters
     Clusters.reserve(BF.layout_size());
     AllClusters.reserve(BF.layout_size());
+    CurCluster.reserve(BF.layout_size());
     Size.reserve(BF.layout_size());
     for (auto BB : BF.layout()) {
       size_t Index = BB->getLayoutIndex();
       Size.push_back(std::max(BB->estimateSize(), size_t(1)));
       AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
       Clusters.push_back(&AllClusters[Index]);
+      CurCluster.push_back(&AllClusters[Index]);
     }

     // Initialize adjacency matrix
@@ -364,6 +319,88 @@ private:
     findFallthroughBlocks(InWeight, OutWeight);
   }

+  /// Merge blocks with their fallthrough successors.
+  void mergeFallthroughs() {
+    for (auto BB : BF.layout()) {
+      if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
+          FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
+        auto CurBB = BB;
+        while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
+          const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
+          mergeClusters(&AllClusters[BB->getLayoutIndex()],
+                        &AllClusters[NextBB->getLayoutIndex()],
+                        0);
+          CurBB = NextBB;
+        }
+      }
+    }
+  }
+
+  /// Merge pairs of clusters while improving the ExtTSP metric
+  void mergeClusterPairs() {
+    while (Clusters.size() > 1) {
+      Cluster *BestClusterPred = nullptr;
+      Cluster *BestClusterSucc = nullptr;
+      std::pair<double, size_t> BestGain(-1, 0);
+      for (auto ClusterPred : Clusters) {
+        // Do not merge cold blocks
+        if (ClusterPred->isCold())
+          continue;
+
+        // Get candidates for merging with the current cluster
+        Adjacent.forAllAdjacent(
+          ClusterPred,
+          // Find the best candidate
+          [&](Cluster *ClusterSucc) {
+            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
+            assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
+
+            // Compute the gain of merging two clusters
+            auto Gain = mergeGain(ClusterPred, ClusterSucc);
+            if (Gain.first <= 0.0)
+              return;
+
+            // Breaking ties by density to make the hottest clusters be merged first
+            if (Gain.first > BestGain.first ||
+                (std::abs(Gain.first - BestGain.first) < 1e-8 &&
+                 compareClusterPairs(ClusterPred,
+                                     ClusterSucc,
+                                     BestClusterPred,
+                                     BestClusterSucc))) {
+              BestGain = Gain;
+              BestClusterPred = ClusterPred;
+              BestClusterSucc = ClusterSucc;
+            }
+          });
+      }
+
+      // Stop merging when there is no improvement
+      if (BestGain.first <= 0.0)
+        break;
+
+      // Merge the best pair of clusters
+      mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
+    }
+  }
+
+  /// Merge cold blocks to reduce code size
+  void mergeColdClusters() {
+    for (auto SrcBB : BF.layout()) {
+      // Iterating in reverse order to make sure original fall-through jumps
+      // are merged first
+      for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) {
+        BinaryBasicBlock *DstBB = *Itr;
+        auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()];
+        auto DstCluster = CurCluster[DstBB->getLayoutIndex()];
+        if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() &&
+            SrcCluster->blocks().back() == SrcBB &&
+            DstCluster->blocks().front() == DstBB) {
+          mergeClusters(SrcCluster, DstCluster, 0);
+        }
+      }
+    }
+  }
+
   /// For a pair of blocks, A and B, block B is the fallthrough successor of A,
   /// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
   /// to B are from A. Such blocks should be adjacent in an optimal ordering,
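The fallthrough-successor definition quoted above can be stated as a small
predicate. The sketch below uses a hypothetical Edge type for profiled CFG
edges, purely to spell out conditions (i) and (ii); it is not how BOLT stores
its profile.

#include <cstdint>
#include <vector>

// Hypothetical profiled CFG edge (not a BOLT type).
struct Edge {
  int Src;        // layout index of the jump's source block
  int Dst;        // layout index of the jump's destination block
  uint64_t Count; // profiled execution count of the jump
};

// B is a fallthrough successor of A iff (i) every profiled jump leaving A
// targets B, and (ii) every profiled jump entering B originates at A.
bool isFallthroughSuccessor(int A, int B, const std::vector<Edge> &Edges) {
  for (const auto &E : Edges) {
    if (E.Count == 0)
      continue; // ignore edges never taken in the profile
    if (E.Src == A && E.Dst != B)
      return false; // condition (i) violated
    if (E.Dst == B && E.Src != A)
      return false; // condition (ii) violated
  }
  return true;
}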
@@ -558,11 +595,17 @@ private:
     // Merge the blocks of clusters
     auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
     Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
+    From->clear();

     // Remove cluster From from the list of active clusters
     auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
     Clusters.erase(Iter, Clusters.end());

+    // Update block clusters
+    for (auto BB : Into->blocks()) {
+      CurCluster[BB->getLayoutIndex()] = Into;
+    }
+
     // Invalidate caches
     Cache.invalidate(Into);
@@ -582,6 +625,9 @@ private:
   // Active clusters. The vector gets updated at runtime when clusters are merged
   std::vector<Cluster *> Clusters;

+  // Current cluster of a basic block
+  std::vector<Cluster *> CurCluster;
+
   // Size of the block
   std::vector<uint64_t> Size;

View File

@@ -9,24 +9,6 @@
 //
 //===----------------------------------------------------------------------===//

-// TODO: copyright/license msg.
-
-/*
-   +----------------------------------------------------------------------+
-   | HipHop for PHP                                                       |
-   +----------------------------------------------------------------------+
-   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
-   +----------------------------------------------------------------------+
-   | This source file is subject to version 3.01 of the PHP license,      |
-   | that is bundled with this package in the file LICENSE, and is        |
-   | available through the world-wide-web at the following url:           |
-   | http://www.php.net/license/3_01.txt                                  |
-   | If you did not receive a copy of the PHP license and are unable to   |
-   | obtain it through the world-wide-web, please send a note to          |
-   | license@php.net so we can mail you a copy immediately.               |
-   +----------------------------------------------------------------------+
-*/
-
 #include "BinaryFunction.h"
 #include "HFSort.h"
 #include "ReorderUtils.h"
@@ -112,14 +94,6 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
   return A1->target(0) < A2->target(0);
 }

-/// Sorting clusters by their density in decreasing order.
-template <typename C>
-std::vector<Cluster *> sortByDensity(const C &Clusters_) {
-  std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
-  std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
-  return Clusters;
-}
-
 /// HFSortPlus - layout of hot functions with iTLB cache optimization
 ///
 /// Given an ordering of hot functions (and hence, their assignment to the
@@ -398,15 +372,17 @@ public:
     DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");

+    // Sorting clusters by density in decreasing order
+    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
+
     // Return the set of clusters that are left, which are the ones that
     // didn't get merged (so their first func is its original func)
     std::vector<Cluster> Result;
-    for (auto Cluster : sortByDensity(Clusters)) {
+    Result.reserve(Clusters.size());
+    for (auto Cluster : Clusters) {
       Result.emplace_back(std::move(*Cluster));
     }

+    assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity));
+
     return Result;
   }
@@ -473,6 +449,7 @@ private:
     Adjacent.merge(Into, From);
     Into->merge(*From);
+    From->clear();

     // Update the clusters and addresses for functions merged from From.
     size_t CurAddr = 0;

View File

@@ -246,10 +246,6 @@ public:
 /// A new reordering algorithm for basic blocks, cache+
 class CachePlusReorderAlgorithm : public ReorderAlgorithm {
 public:
-  explicit CachePlusReorderAlgorithm(
-    std::unique_ptr<ClusterAlgorithm> CAlgo) :
-    ReorderAlgorithm(std::move(CAlgo)) { }
-
   void reorderBasicBlocks(
       const BinaryFunction &BF, BasicBlockOrder &Order) const override;
 };