forked from OSchip/llvm-project
[BOLT] merging cold basic blocks to reduce #jumps
Summary: This diff introduces a modification of the cache+ block ordering algorithm, which reorders and merges cold blocks in a function with the goal of reducing the number of (non-fallthrough) jumps and, thus, the code size. (cherry picked from FBD8044978)
This commit is contained in:
parent
b4dbd35d6c
commit
779541283a
|
@ -481,7 +481,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case LT_OPTIMIZE_CACHE_PLUS:
|
case LT_OPTIMIZE_CACHE_PLUS:
|
||||||
Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
|
Algo.reset(new CachePlusReorderAlgorithm());
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case LT_OPTIMIZE_SHUFFLE:
|
case LT_OPTIMIZE_SHUFFLE:
|
||||||
|
|
|
@ -82,7 +82,7 @@ public:
|
||||||
return Blocks;
|
return Blocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update the list of basic blocks and meta-info
|
/// Update the list of basic blocks and aggregated cluster data
|
||||||
void merge(const Cluster *Other,
|
void merge(const Cluster *Other,
|
||||||
const std::vector<BinaryBasicBlock *> &MergedBlocks,
|
const std::vector<BinaryBasicBlock *> &MergedBlocks,
|
||||||
double MergedScore) {
|
double MergedScore) {
|
||||||
|
@ -93,6 +93,10 @@ public:
|
||||||
Score = MergedScore;
|
Score = MergedScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void clear() {
|
||||||
|
Blocks.clear();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<BinaryBasicBlock *> Blocks;
|
std::vector<BinaryBasicBlock *> Blocks;
|
||||||
size_t Id;
|
size_t Id;
|
||||||
|
@ -219,65 +223,14 @@ public:
|
||||||
|
|
||||||
/// Run cache+ algorithm and return a basic block ordering
|
/// Run cache+ algorithm and return a basic block ordering
|
||||||
std::vector<BinaryBasicBlock *> run() {
|
std::vector<BinaryBasicBlock *> run() {
|
||||||
// Merge blocks with their fallthrough successors
|
// Pass 1: Merge blocks with their fallthrough successors
|
||||||
for (auto BB : BF.layout()) {
|
mergeFallthroughs();
|
||||||
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
|
|
||||||
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
|
||||||
auto CurBB = BB;
|
|
||||||
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
|
|
||||||
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
|
|
||||||
mergeClusters(&AllClusters[BB->getLayoutIndex()],
|
|
||||||
&AllClusters[NextBB->getLayoutIndex()],
|
|
||||||
0);
|
|
||||||
CurBB = NextBB;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Merge pairs of clusters while there is an improvement in ExtTSP metric
|
// Pass 2: Merge pairs of clusters while improving the ExtTSP metric
|
||||||
while (Clusters.size() > 1) {
|
mergeClusterPairs();
|
||||||
Cluster *BestClusterPred = nullptr;
|
|
||||||
Cluster *BestClusterSucc = nullptr;
|
|
||||||
std::pair<double, size_t> BestGain(-1, 0);
|
|
||||||
for (auto ClusterPred : Clusters) {
|
|
||||||
// Do not merge cold blocks
|
|
||||||
if (ClusterPred->isCold())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// Get candidates for merging with the current cluster
|
// Pass 3: Merge cold blocks to reduce code size
|
||||||
Adjacent.forAllAdjacent(
|
mergeColdClusters();
|
||||||
ClusterPred,
|
|
||||||
// Find the best candidate
|
|
||||||
[&](Cluster *ClusterSucc) {
|
|
||||||
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
|
||||||
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
|
|
||||||
|
|
||||||
// Compute the gain of merging two clusters
|
|
||||||
auto Gain = mergeGain(ClusterPred, ClusterSucc);
|
|
||||||
if (Gain.first <= 0.0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Breaking ties by density to make the hottest clusters be merged first
|
|
||||||
if (Gain.first > BestGain.first ||
|
|
||||||
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
|
|
||||||
compareClusterPairs(ClusterPred,
|
|
||||||
ClusterSucc,
|
|
||||||
BestClusterPred,
|
|
||||||
BestClusterSucc))) {
|
|
||||||
BestGain = Gain;
|
|
||||||
BestClusterPred = ClusterPred;
|
|
||||||
BestClusterSucc = ClusterSucc;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop merging when there is no improvement
|
|
||||||
if (BestGain.first <= 0.0)
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Merge the best pair of clusters
|
|
||||||
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sorting clusters by density
|
// Sorting clusters by density
|
||||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||||
|
@ -339,12 +292,14 @@ private:
|
||||||
// Initialize clusters
|
// Initialize clusters
|
||||||
Clusters.reserve(BF.layout_size());
|
Clusters.reserve(BF.layout_size());
|
||||||
AllClusters.reserve(BF.layout_size());
|
AllClusters.reserve(BF.layout_size());
|
||||||
|
CurCluster.reserve(BF.layout_size());
|
||||||
Size.reserve(BF.layout_size());
|
Size.reserve(BF.layout_size());
|
||||||
for (auto BB : BF.layout()) {
|
for (auto BB : BF.layout()) {
|
||||||
size_t Index = BB->getLayoutIndex();
|
size_t Index = BB->getLayoutIndex();
|
||||||
Size.push_back(std::max(BB->estimateSize(), size_t(1)));
|
Size.push_back(std::max(BB->estimateSize(), size_t(1)));
|
||||||
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
|
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
|
||||||
Clusters.push_back(&AllClusters[Index]);
|
Clusters.push_back(&AllClusters[Index]);
|
||||||
|
CurCluster.push_back(&AllClusters[Index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize adjacency matrix
|
// Initialize adjacency matrix
|
||||||
|
@ -364,6 +319,88 @@ private:
|
||||||
findFallthroughBlocks(InWeight, OutWeight);
|
findFallthroughBlocks(InWeight, OutWeight);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merge blocks with their fallthrough successors.
|
||||||
|
void mergeFallthroughs() {
|
||||||
|
for (auto BB : BF.layout()) {
|
||||||
|
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
|
||||||
|
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
|
||||||
|
auto CurBB = BB;
|
||||||
|
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
|
||||||
|
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
|
||||||
|
mergeClusters(&AllClusters[BB->getLayoutIndex()],
|
||||||
|
&AllClusters[NextBB->getLayoutIndex()],
|
||||||
|
0);
|
||||||
|
CurBB = NextBB;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge pairs of clusters while improving the ExtTSP metric
|
||||||
|
void mergeClusterPairs() {
|
||||||
|
while (Clusters.size() > 1) {
|
||||||
|
Cluster *BestClusterPred = nullptr;
|
||||||
|
Cluster *BestClusterSucc = nullptr;
|
||||||
|
std::pair<double, size_t> BestGain(-1, 0);
|
||||||
|
for (auto ClusterPred : Clusters) {
|
||||||
|
// Do not merge cold blocks
|
||||||
|
if (ClusterPred->isCold())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Get candidates for merging with the current cluster
|
||||||
|
Adjacent.forAllAdjacent(
|
||||||
|
ClusterPred,
|
||||||
|
// Find the best candidate
|
||||||
|
[&](Cluster *ClusterSucc) {
|
||||||
|
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
|
||||||
|
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
|
||||||
|
|
||||||
|
// Compute the gain of merging two clusters
|
||||||
|
auto Gain = mergeGain(ClusterPred, ClusterSucc);
|
||||||
|
if (Gain.first <= 0.0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Breaking ties by density to make the hottest clusters be merged first
|
||||||
|
if (Gain.first > BestGain.first ||
|
||||||
|
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
|
||||||
|
compareClusterPairs(ClusterPred,
|
||||||
|
ClusterSucc,
|
||||||
|
BestClusterPred,
|
||||||
|
BestClusterSucc))) {
|
||||||
|
BestGain = Gain;
|
||||||
|
BestClusterPred = ClusterPred;
|
||||||
|
BestClusterSucc = ClusterSucc;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop merging when there is no improvement
|
||||||
|
if (BestGain.first <= 0.0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Merge the best pair of clusters
|
||||||
|
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge cold blocks to reduce code size
|
||||||
|
void mergeColdClusters() {
|
||||||
|
for (auto SrcBB : BF.layout()) {
|
||||||
|
// Iterating in reverse order to make sure original fall-through jumps are
|
||||||
|
// merged first
|
||||||
|
for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) {
|
||||||
|
BinaryBasicBlock *DstBB = *Itr;
|
||||||
|
auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()];
|
||||||
|
auto DstCluster = CurCluster[DstBB->getLayoutIndex()];
|
||||||
|
if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() &&
|
||||||
|
SrcCluster->blocks().back() == SrcBB &&
|
||||||
|
DstCluster->blocks().front() == DstBB) {
|
||||||
|
mergeClusters(SrcCluster, DstCluster, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
|
/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
|
||||||
/// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
|
/// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
|
||||||
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
|
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
|
||||||
|
@ -558,11 +595,17 @@ private:
|
||||||
// Merge the blocks of clusters
|
// Merge the blocks of clusters
|
||||||
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
|
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
|
||||||
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
|
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
|
||||||
|
From->clear();
|
||||||
|
|
||||||
// Remove cluster From from the list of active clusters
|
// Remove cluster From from the list of active clusters
|
||||||
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
|
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
|
||||||
Clusters.erase(Iter, Clusters.end());
|
Clusters.erase(Iter, Clusters.end());
|
||||||
|
|
||||||
|
// Update block clusters
|
||||||
|
for (auto BB : Into->blocks()) {
|
||||||
|
CurCluster[BB->getLayoutIndex()] = Into;
|
||||||
|
}
|
||||||
|
|
||||||
// Invalidate caches
|
// Invalidate caches
|
||||||
Cache.invalidate(Into);
|
Cache.invalidate(Into);
|
||||||
|
|
||||||
|
@ -582,6 +625,9 @@ private:
|
||||||
// Active clusters. The vector gets updated at runtime when clusters are merged
|
// Active clusters. The vector gets updated at runtime when clusters are merged
|
||||||
std::vector<Cluster *> Clusters;
|
std::vector<Cluster *> Clusters;
|
||||||
|
|
||||||
|
// Current cluster of a basic block
|
||||||
|
std::vector<Cluster *> CurCluster;
|
||||||
|
|
||||||
// Size of the block
|
// Size of the block
|
||||||
std::vector<uint64_t> Size;
|
std::vector<uint64_t> Size;
|
||||||
|
|
||||||
|
|
|
@ -9,24 +9,6 @@
|
||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
// TODO: copyright/license msg.
|
|
||||||
|
|
||||||
/*
|
|
||||||
+----------------------------------------------------------------------+
|
|
||||||
| HipHop for PHP |
|
|
||||||
+----------------------------------------------------------------------+
|
|
||||||
| Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
|
|
||||||
+----------------------------------------------------------------------+
|
|
||||||
| This source file is subject to version 3.01 of the PHP license, |
|
|
||||||
| that is bundled with this package in the file LICENSE, and is |
|
|
||||||
| available through the world-wide-web at the following url: |
|
|
||||||
| http://www.php.net/license/3_01.txt |
|
|
||||||
| If you did not receive a copy of the PHP license and are unable to |
|
|
||||||
| obtain it through the world-wide-web, please send a note to |
|
|
||||||
| license@php.net so we can mail you a copy immediately. |
|
|
||||||
+----------------------------------------------------------------------+
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "BinaryFunction.h"
|
#include "BinaryFunction.h"
|
||||||
#include "HFSort.h"
|
#include "HFSort.h"
|
||||||
#include "ReorderUtils.h"
|
#include "ReorderUtils.h"
|
||||||
|
@ -112,14 +94,6 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
|
||||||
return A1->target(0) < A2->target(0);
|
return A1->target(0) < A2->target(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sorting clusters by their density in decreasing order.
|
|
||||||
template <typename C>
|
|
||||||
std::vector<Cluster *> sortByDensity(const C &Clusters_) {
|
|
||||||
std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
|
|
||||||
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
|
||||||
return Clusters;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// HFSortPlus - layout of hot functions with iTLB cache optimization
|
/// HFSortPlus - layout of hot functions with iTLB cache optimization
|
||||||
///
|
///
|
||||||
/// Given an ordering of hot functions (and hence, their assignment to the
|
/// Given an ordering of hot functions (and hence, their assignment to the
|
||||||
|
@ -398,15 +372,17 @@ public:
|
||||||
|
|
||||||
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
|
DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n");
|
||||||
|
|
||||||
|
// Sorting clusters by density in decreasing order
|
||||||
|
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
|
||||||
|
|
||||||
// Return the set of clusters that are left, which are the ones that
|
// Return the set of clusters that are left, which are the ones that
|
||||||
// didn't get merged (so their first func is its original func)
|
// didn't get merged (so their first func is its original func)
|
||||||
std::vector<Cluster> Result;
|
std::vector<Cluster> Result;
|
||||||
for (auto Cluster : sortByDensity(Clusters)) {
|
Result.reserve(Clusters.size());
|
||||||
|
for (auto Cluster : Clusters) {
|
||||||
Result.emplace_back(std::move(*Cluster));
|
Result.emplace_back(std::move(*Cluster));
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity));
|
|
||||||
|
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -473,6 +449,7 @@ private:
|
||||||
Adjacent.merge(Into, From);
|
Adjacent.merge(Into, From);
|
||||||
|
|
||||||
Into->merge(*From);
|
Into->merge(*From);
|
||||||
|
From->clear();
|
||||||
|
|
||||||
// Update the clusters and addresses for functions merged from From.
|
// Update the clusters and addresses for functions merged from From.
|
||||||
size_t CurAddr = 0;
|
size_t CurAddr = 0;
|
||||||
|
|
|
@ -246,10 +246,6 @@ public:
|
||||||
/// A new reordering algorithm for basic blocks, cache+
|
/// A new reordering algorithm for basic blocks, cache+
|
||||||
class CachePlusReorderAlgorithm : public ReorderAlgorithm {
|
class CachePlusReorderAlgorithm : public ReorderAlgorithm {
|
||||||
public:
|
public:
|
||||||
explicit CachePlusReorderAlgorithm(
|
|
||||||
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
|
||||||
ReorderAlgorithm(std::move(CAlgo)) { }
|
|
||||||
|
|
||||||
void reorderBasicBlocks(
|
void reorderBasicBlocks(
|
||||||
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue