forked from OSchip/llvm-project
[BOLT] merging cold basic blocks to reduce #jumps
Summary: This diff introduces a modification of the cache+ block ordering algorithm, which reorders and merges cold blocks in a function with the goal of reducing the number of (non-fallthrough) jumps and, thus, the code size. (cherry picked from FBD8044978)
This commit is contained in:
parent
b4dbd35d6c
commit
779541283a
|
@ -481,7 +481,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
|
|||
break;
|
||||
|
||||
case LT_OPTIMIZE_CACHE_PLUS:
|
||||
Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
|
||||
Algo.reset(new CachePlusReorderAlgorithm());
|
||||
break;
|
||||
|
||||
case LT_OPTIMIZE_SHUFFLE:
|
||||
|
|
|
@ -82,7 +82,7 @@ public:
|
|||
return Blocks;
|
||||
}
|
||||
|
||||
/// Update the list of basic blocks and meta-info
|
||||
/// Update the list of basic blocks and aggregated cluster data
|
||||
void merge(const Cluster *Other,
|
||||
const std::vector<BinaryBasicBlock *> &MergedBlocks,
|
||||
double MergedScore) {
|
||||
|
@ -93,6 +93,10 @@ public:
|
|||
Score = MergedScore;
|
||||
}
|
||||
|
||||
/// Drop every basic block held by this cluster, leaving its block list empty.
void clear() {
|
||||
Blocks.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<BinaryBasicBlock *> Blocks;
|
||||
size_t Id;
|
||||
|
@ -219,65 +223,14 @@ public:
|
|||
|
||||
/// Run cache+ algorithm and return a basic block ordering
|
||||
std::vector<BinaryBasicBlock *> run() {
|
||||
// Merge blocks with their fallthrough successors
|
||||
for (auto BB : BF.layout()) {
|
||||
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
|
||||
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
||||
auto CurBB = BB;
|
||||
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
|
||||
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
|
||||
mergeClusters(&AllClusters[BB->getLayoutIndex()],
|
||||
&AllClusters[NextBB->getLayoutIndex()],
|
||||
0);
|
||||
CurBB = NextBB;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Pass 1: Merge blocks with their fallthrough successors
|
||||
mergeFallthroughs();
|
||||
|
||||
// Merge pairs of clusters while there is an improvement in ExtTSP metric
|
||||
while (Clusters.size() > 1) {
|
||||
Cluster *BestClusterPred = nullptr;
|
||||
Cluster *BestClusterSucc = nullptr;
|
||||
std::pair<double, size_t> BestGain(-1, 0);
|
||||
for (auto ClusterPred : Clusters) {
|
||||
// Do not merge cold blocks
|
||||
if (ClusterPred->isCold())
|
||||
continue;
|
||||
// Pass 2: Merge pairs of clusters while improving the ExtTSP metric
|
||||
mergeClusterPairs();
|
||||
|
||||
// Get candidates for merging with the current cluster
|
||||
Adjacent.forAllAdjacent(
|
||||
ClusterPred,
|
||||
// Find the best candidate
|
||||
[&](Cluster *ClusterSucc) {
|
||||
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
||||
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
|
||||
|
||||
// Compute the gain of merging two clusters
|
||||
auto Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||
if (Gain.first <= 0.0)
|
||||
return;
|
||||
|
||||
// Breaking ties by density to make the hottest clusters be merged first
|
||||
if (Gain.first > BestGain.first ||
|
||||
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
|
||||
compareClusterPairs(ClusterPred,
|
||||
ClusterSucc,
|
||||
BestClusterPred,
|
||||
BestClusterSucc))) {
|
||||
BestGain = Gain;
|
||||
BestClusterPred = ClusterPred;
|
||||
BestClusterSucc = ClusterSucc;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Stop merging when there is no improvement
|
||||
if (BestGain.first <= 0.0)
|
||||
break;
|
||||
|
||||
// Merge the best pair of clusters
|
||||
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
|
||||
}
|
||||
// Pass 3: Merge cold blocks to reduce code size
|
||||
mergeColdClusters();
|
||||
|
||||
// Sorting clusters by density
|
||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||
|
@ -339,12 +292,14 @@ private:
|
|||
// Initialize clusters
|
||||
Clusters.reserve(BF.layout_size());
|
||||
AllClusters.reserve(BF.layout_size());
|
||||
CurCluster.reserve(BF.layout_size());
|
||||
Size.reserve(BF.layout_size());
|
||||
for (auto BB : BF.layout()) {
|
||||
size_t Index = BB->getLayoutIndex();
|
||||
Size.push_back(std::max(BB->estimateSize(), size_t(1)));
|
||||
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
|
||||
Clusters.push_back(&AllClusters[Index]);
|
||||
CurCluster.push_back(&AllClusters[Index]);
|
||||
}
|
||||
|
||||
// Initialize adjacency matrix
|
||||
|
@ -364,6 +319,88 @@ private:
|
|||
findFallthroughBlocks(InWeight, OutWeight);
|
||||
}
|
||||
|
||||
/// Merge blocks with their fallthrough successors.
///
/// For every block in the function layout that has no recorded fallthrough
/// predecessor but does have a fallthrough successor (i.e. the head of a
/// fallthrough chain), follow the FallthroughSucc links to the end of the
/// chain and merge the AllClusters entry of each visited successor into the
/// AllClusters entry of the chain head.
|
||||
void mergeFallthroughs() {
|
||||
for (auto BB : BF.layout()) {
|
||||
// Only start from chain heads so that each chain is walked exactly once
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
|
||||
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
||||
auto CurBB = BB;
|
||||
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
|
||||
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
|
||||
// The destination is always the head's cluster (indexed by BB, not CurBB).
// NOTE(review): the trailing 0 is presumably the merge type selecting a
// plain "head followed by successor" concatenation -- confirm against
// mergeClusters.
mergeClusters(&AllClusters[BB->getLayoutIndex()],
|
||||
&AllClusters[NextBB->getLayoutIndex()],
|
||||
0);
|
||||
CurBB = NextBB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge pairs of clusters while improving the ExtTSP metric
///
/// Greedy loop: on every iteration, scan all non-cold active clusters and
/// their adjacent clusters, pick the pair with the largest positive merge
/// gain (as reported by mergeGain), and merge it. The loop ends when only one
/// active cluster is left or when no pair yields a positive gain.
|
||||
void mergeClusterPairs() {
|
||||
while (Clusters.size() > 1) {
|
||||
Cluster *BestClusterPred = nullptr;
|
||||
Cluster *BestClusterSucc = nullptr;
|
||||
// Best gain found so far in this iteration; starts negative so that any
// positive gain wins the first comparison below
std::pair<double, size_t> BestGain(-1, 0);
|
||||
for (auto ClusterPred : Clusters) {
|
||||
// Do not merge cold blocks
|
||||
if (ClusterPred->isCold())
|
||||
continue;
|
||||
|
||||
// Get candidates for merging with the current cluster
|
||||
Adjacent.forAllAdjacent(
|
||||
ClusterPred,
|
||||
// Find the best candidate
|
||||
[&](Cluster *ClusterSucc) {
|
||||
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
||||
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
|
||||
|
||||
// Compute the gain of merging two clusters
|
||||
auto Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||
// Pairs that do not improve the score are never candidates
if (Gain.first <= 0.0)
|
||||
return;
|
||||
|
||||
// Breaking ties by density to make the hottest clusters be merged first
// (gains closer than 1e-8 to the current best are treated as ties and
// decided by compareClusterPairs)
|
||||
if (Gain.first > BestGain.first ||
|
||||
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
|
||||
compareClusterPairs(ClusterPred,
|
||||
ClusterSucc,
|
||||
BestClusterPred,
|
||||
BestClusterSucc))) {
|
||||
BestGain = Gain;
|
||||
BestClusterPred = ClusterPred;
|
||||
BestClusterSucc = ClusterSucc;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Stop merging when there is no improvement
|
||||
if (BestGain.first <= 0.0)
|
||||
break;
|
||||
|
||||
// Merge the best pair of clusters
// (the second component of the gain is forwarded as the last argument of
// mergeClusters)
|
||||
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge cold blocks to reduce code size
///
/// For every CFG edge SrcBB -> DstBB whose endpoints currently live in
/// different clusters, append the destination cluster to the source cluster
/// when SrcBB is the last block of its cluster, DstBB is the first block of
/// its cluster, and the destination cluster is not an entry point.
/// NOTE(review): no explicit isCold() check is made here; presumably the
/// clusters still eligible at this point are the cold ones left unmerged by
/// the earlier passes -- confirm against run().
|
||||
void mergeColdClusters() {
|
||||
for (auto SrcBB : BF.layout()) {
|
||||
// Iterating in reverse order to make sure original fall-through jumps are
|
||||
// merged first
|
||||
for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) {
|
||||
BinaryBasicBlock *DstBB = *Itr;
|
||||
// Look the clusters up on every iteration: an earlier merge in this loop
// may have changed the cluster a block belongs to
auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()];
|
||||
auto DstCluster = CurCluster[DstBB->getLayoutIndex()];
|
||||
if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() &&
|
||||
SrcCluster->blocks().back() == SrcBB &&
|
||||
DstCluster->blocks().front() == DstBB) {
|
||||
mergeClusters(SrcCluster, DstCluster, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
|
||||
/// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
|
||||
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
|
||||
|
@ -558,11 +595,17 @@ private:
|
|||
// Merge the blocks of clusters
|
||||
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
|
||||
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
|
||||
From->clear();
|
||||
|
||||
// Remove cluster From from the list of active clusters
|
||||
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
|
||||
Clusters.erase(Iter, Clusters.end());
|
||||
|
||||
// Update block clusters
|
||||
for (auto BB : Into->blocks()) {
|
||||
CurCluster[BB->getLayoutIndex()] = Into;
|
||||
}
|
||||
|
||||
// Invalidate caches
|
||||
Cache.invalidate(Into);
|
||||
|
||||
|
@ -582,6 +625,9 @@ private:
|
|||
// Active clusters. The vector gets updated at runtime when clusters are merged
|
||||
std::vector<Cluster *> Clusters;
|
||||
|
||||
// Current cluster of a basic block
|
||||
std::vector<Cluster *> CurCluster;
|
||||
|
||||
// Size of the block
|
||||
std::vector<uint64_t> Size;
|
||||
|
||||
|
|
|
@ -9,24 +9,6 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// TODO: copyright/license msg.
|
||||
|
||||
/*
|
||||
+----------------------------------------------------------------------+
|
||||
| HipHop for PHP |
|
||||
+----------------------------------------------------------------------+
|
||||
| Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
|
||||
+----------------------------------------------------------------------+
|
||||
| This source file is subject to version 3.01 of the PHP license, |
|
||||
| that is bundled with this package in the file LICENSE, and is |
|
||||
| available through the world-wide-web at the following url: |
|
||||
| http://www.php.net/license/3_01.txt |
|
||||
| If you did not receive a copy of the PHP license and are unable to |
|
||||
| obtain it through the world-wide-web, please send a note to |
|
||||
| license@php.net so we can mail you a copy immediately. |
|
||||
+----------------------------------------------------------------------+
|
||||
*/
|
||||
|
||||
#include "BinaryFunction.h"
|
||||
#include "HFSort.h"
|
||||
#include "ReorderUtils.h"
|
||||
|
@ -112,14 +94,6 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
|
|||
return A1->target(0) < A2->target(0);
|
||||
}
|
||||
|
||||
/// Sorting clusters by their density in decreasing order.
///
/// Copies the cluster pointers from \p Clusters_ into a new vector, stably
/// sorts that copy with compareClusters, and returns it; the input container
/// itself is not modified.
|
||||
template <typename C>
|
||||
std::vector<Cluster *> sortByDensity(const C &Clusters_) {
|
||||
std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
|
||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||
return Clusters;
|
||||
}
|
||||
|
||||
/// HFSortPlus - layout of hot functions with iTLB cache optimization
|
||||
///
|
||||
/// Given an ordering of hot functions (and hence, their assignment to the
|
||||
|
@ -398,15 +372,17 @@ public:
|
|||
|
||||
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
|
||||
|
||||
// Sorting clusters by density in decreasing order
|
||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||
|
||||
// Return the set of clusters that are left, which are the ones that
|
||||
// didn't get merged (so their first func is its original func)
|
||||
std::vector<Cluster> Result;
|
||||
for (auto Cluster : sortByDensity(Clusters)) {
|
||||
Result.reserve(Clusters.size());
|
||||
for (auto Cluster : Clusters) {
|
||||
Result.emplace_back(std::move(*Cluster));
|
||||
}
|
||||
|
||||
assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity));
|
||||
|
||||
return Result;
|
||||
}
|
||||
|
||||
|
@ -473,6 +449,7 @@ private:
|
|||
Adjacent.merge(Into, From);
|
||||
|
||||
Into->merge(*From);
|
||||
From->clear();
|
||||
|
||||
// Update the clusters and addresses for functions merged from From.
|
||||
size_t CurAddr = 0;
|
||||
|
|
|
@ -246,10 +246,6 @@ public:
|
|||
/// A new reordering algorithm for basic blocks, cache+
|
||||
class CachePlusReorderAlgorithm : public ReorderAlgorithm {
|
||||
public:
|
||||
/// Take ownership of the cluster algorithm \p CAlgo and hand it over to the
/// ReorderAlgorithm base class.
explicit CachePlusReorderAlgorithm(
|
||||
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
||||
ReorderAlgorithm(std::move(CAlgo)) { }
|
||||
|
||||
/// Override of the base-class entry point for function \p BF.
/// NOTE(review): presumably writes the computed block layout into \p Order;
/// the definition is not visible here -- confirm.
void reorderBasicBlocks(
|
||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue