//===------ CacheMetrics.cpp - Calculate metrics for instruction cache ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functions to show metrics of cache lines
//
//===----------------------------------------------------------------------===//
|
2021-10-09 02:47:10 +08:00
|
|
|
#include "bolt/Passes/CacheMetrics.h"
|
|
|
|
#include "bolt/Core/BinaryBasicBlock.h"
|
|
|
|
#include "bolt/Core/BinaryFunction.h"
|
2020-12-02 08:29:39 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2021-05-01 04:54:02 +08:00
|
|
|
#include <unordered_map>
|
2017-06-14 07:29:39 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
using namespace bolt;
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
|
|
|
|
namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Weights and distance thresholds for the ExtTSP score of forward and
// backward jumps (declared/registered in another translation unit).
extern cl::opt<double> ForwardWeight;
extern cl::opt<double> BackwardWeight;
extern cl::opt<unsigned> ForwardDistance;
extern cl::opt<unsigned> BackwardDistance;
// i-TLB model parameters: page size in bytes and number of cache entries.
extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts
|
2017-11-15 08:51:24 +08:00
|
|
|
|
2017-06-14 07:29:39 +08:00
|
|
|
namespace {
|
|
|
|
|
2017-11-15 08:51:24 +08:00
|
|
|
/// Initialize and return a position map for binary basic blocks
|
|
|
|
void extractBasicBlockInfo(
|
2021-12-15 08:52:51 +08:00
|
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
|
|
std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
|
|
std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
2017-11-15 08:51:24 +08:00
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
|
|
const BinaryContext &BC = BF->getBinaryContext();
|
|
|
|
for (BinaryBasicBlock *BB : BF->layout()) {
|
2018-05-12 03:03:19 +08:00
|
|
|
if (BF->isSimple() || BC.HasRelocations) {
|
|
|
|
// Use addresses/sizes as in the output binary
|
|
|
|
BBAddr[BB] = BB->getOutputAddressRange().first;
|
|
|
|
BBSize[BB] = BB->getOutputSize();
|
|
|
|
} else {
|
|
|
|
// Output ranges should match the input if the body hasn't changed
|
|
|
|
BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress();
|
|
|
|
BBSize[BB] = BB->getOriginalSize();
|
|
|
|
}
|
2017-11-15 08:51:24 +08:00
|
|
|
}
|
2017-06-14 07:29:39 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-15 08:51:24 +08:00
|
|
|
/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
|
|
|
|
/// the ordering of basic blocks
|
2021-12-15 08:52:51 +08:00
|
|
|
double
|
|
|
|
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
2017-11-15 08:51:24 +08:00
|
|
|
|
|
|
|
double Score = 0;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
2018-05-12 03:03:19 +08:00
|
|
|
if (!BF->hasProfile())
|
|
|
|
continue;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *SrcBB : BF->layout()) {
|
2017-11-15 08:51:24 +08:00
|
|
|
auto BI = SrcBB->branch_info_begin();
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
|
2017-11-15 08:51:24 +08:00
|
|
|
if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
|
|
|
|
BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
|
|
|
|
Score += BI->Count;
|
|
|
|
++BI;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Score;
|
|
|
|
}
|
|
|
|
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
|
|
|
|
/// misses for a given ordering of basic blocks
|
2017-11-15 08:51:24 +08:00
|
|
|
double calcExtTSPScore(
|
2021-12-15 08:52:51 +08:00
|
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
2017-11-15 08:51:24 +08:00
|
|
|
|
|
|
|
double Score = 0.0;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
2018-02-10 01:58:19 +08:00
|
|
|
if (!BF->hasProfile())
|
|
|
|
continue;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *SrcBB : BF->layout()) {
|
2017-11-15 08:51:24 +08:00
|
|
|
auto BI = SrcBB->branch_info_begin();
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
|
2017-11-15 08:51:24 +08:00
|
|
|
if (DstBB != SrcBB) {
|
2021-12-15 08:52:51 +08:00
|
|
|
Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), BBSize.at(SrcBB),
|
|
|
|
BBAddr.at(DstBB), BI->Count);
|
2017-11-15 08:51:24 +08:00
|
|
|
}
|
|
|
|
++BI;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Score;
|
|
|
|
}
|
|
|
|
|
|
|
|
using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
|
|
|
|
|
|
|
|
/// Build a simplified version of the call graph: For every function, keep
|
|
|
|
/// its callers and the frequencies of the calls
|
|
|
|
std::unordered_map<const BinaryFunction *, Predecessors>
|
|
|
|
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
|
|
|
|
std::unordered_map<const BinaryFunction *, Predecessors> Calls;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *SrcFunction : BinaryFunctions) {
|
|
|
|
const BinaryContext &BC = SrcFunction->getBinaryContext();
|
|
|
|
for (BinaryBasicBlock *BB : SrcFunction->layout()) {
|
2017-11-15 08:51:24 +08:00
|
|
|
// Find call instructions and extract target symbols from each one
|
2021-04-08 15:19:26 +08:00
|
|
|
for (MCInst &Inst : *BB) {
|
2018-03-10 01:45:13 +08:00
|
|
|
if (!BC.MIB->isCall(Inst))
|
2017-11-15 08:51:24 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Call info
|
2021-12-15 08:52:51 +08:00
|
|
|
const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
|
2021-04-08 15:19:26 +08:00
|
|
|
uint64_t Count = BB->getKnownExecutionCount();
|
2017-11-15 08:51:24 +08:00
|
|
|
// Ignore calls w/o information
|
|
|
|
if (DstSym == nullptr || Count == 0)
|
|
|
|
continue;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
|
2017-11-15 08:51:24 +08:00
|
|
|
// Ignore recursive calls
|
2021-12-15 08:52:51 +08:00
|
|
|
if (DstFunction == nullptr || DstFunction->layout_empty() ||
|
2017-11-15 08:51:24 +08:00
|
|
|
DstFunction == SrcFunction)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Record the call
|
2021-05-08 09:43:25 +08:00
|
|
|
Calls[DstFunction].emplace_back(SrcFunction, Count);
|
2017-11-15 08:51:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Calls;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
|
|
|
|
/// Given an assignment of functions to the i-TLB pages), we divide all
|
|
|
|
/// functions calls into two categories:
|
|
|
|
/// - 'short' ones that have a caller-callee distance less than a page;
|
|
|
|
/// - 'long' ones where the distance exceeds a page.
|
2021-12-15 08:52:51 +08:00
|
|
|
/// The short calls are likely to result in a i-TLB cache hit. For the long
|
|
|
|
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
|
|
|
|
/// often the page is accessed). Assuming that functions are sent to the i-TLB
|
|
|
|
/// cache in a random order, the probability that a page is present in the cache
|
|
|
|
/// is proportional to the number of samples corresponding to the functions on
|
|
|
|
/// the page. The following procedure detects short and long calls, and
|
|
|
|
/// estimates the expected number of cache misses for the long ones.
|
2017-11-15 08:51:24 +08:00
|
|
|
double expectedCacheHitRatio(
|
2021-12-15 08:52:51 +08:00
|
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
2017-11-15 08:51:24 +08:00
|
|
|
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
const double PageSize = opts::ITLBPageSize;
|
|
|
|
const uint64_t CacheEntries = opts::ITLBEntries;
|
2021-04-08 15:19:26 +08:00
|
|
|
std::unordered_map<const BinaryFunction *, Predecessors> Calls =
|
|
|
|
extractFunctionCalls(BinaryFunctions);
|
2017-11-15 08:51:24 +08:00
|
|
|
// Compute 'hotness' of the functions
|
|
|
|
double TotalSamples = 0;
|
|
|
|
std::unordered_map<BinaryFunction *, double> FunctionSamples;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
2017-11-15 08:51:24 +08:00
|
|
|
double Samples = 0;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
|
2017-11-15 08:51:24 +08:00
|
|
|
Samples += Pair.second;
|
|
|
|
}
|
|
|
|
Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
|
|
|
|
FunctionSamples[BF] = Samples;
|
|
|
|
TotalSamples += Samples;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compute 'hotness' of the pages
|
|
|
|
std::unordered_map<uint64_t, double> PageSamples;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
2017-11-15 08:51:24 +08:00
|
|
|
if (BF->layout_empty())
|
|
|
|
continue;
|
2021-04-08 15:19:26 +08:00
|
|
|
double Page = BBAddr.at(BF->layout_front()) / PageSize;
|
2017-11-15 08:51:24 +08:00
|
|
|
PageSamples[Page] += FunctionSamples.at(BF);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Computing the expected number of misses for every function
|
|
|
|
double Misses = 0;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
2017-11-15 08:51:24 +08:00
|
|
|
// Skip the function if it has no samples
|
|
|
|
if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0)
|
|
|
|
continue;
|
|
|
|
double Samples = FunctionSamples.at(BF);
|
2021-04-08 15:19:26 +08:00
|
|
|
double Page = BBAddr.at(BF->layout_front()) / PageSize;
|
2017-11-15 08:51:24 +08:00
|
|
|
// The probability that the page is not present in the cache
|
|
|
|
double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);
|
|
|
|
|
|
|
|
// Processing all callers of the function
|
2021-04-08 15:19:26 +08:00
|
|
|
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
|
|
|
|
BinaryFunction *SrcFunction = Pair.first;
|
|
|
|
double SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize;
|
2017-11-15 08:51:24 +08:00
|
|
|
// Is this a 'long' or a 'short' call?
|
|
|
|
if (Page != SrcPage) {
|
|
|
|
// This is a miss
|
|
|
|
Misses += MissProb * Pair.second;
|
|
|
|
}
|
|
|
|
Samples -= Pair.second;
|
|
|
|
}
|
|
|
|
assert(Samples >= 0.0 && "Function samples computed incorrectly");
|
|
|
|
// The remaining samples likely come from the jitted code
|
|
|
|
Misses += Samples * MissProb;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 100.0 * (1.0 - Misses / TotalSamples);
|
|
|
|
}
|
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
} // namespace
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
|
|
|
|
uint64_t DstAddr, uint64_t Count) {
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
|
|
|
|
|
|
|
|
// Fallthrough
|
|
|
|
if (SrcAddr + SrcSize == DstAddr) {
|
2021-01-28 10:29:16 +08:00
|
|
|
// Assume that FallthroughWeight = 1.0 after normalization
|
|
|
|
return static_cast<double>(Count);
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
}
|
|
|
|
// Forward
|
|
|
|
if (SrcAddr + SrcSize < DstAddr) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
if (Dist <= opts::ForwardDistance) {
|
|
|
|
double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
|
|
|
|
return opts::ForwardWeight * Prob * Count;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
// Backward
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
|
[BOLT] a new block reordering algorithm
Summary:
A new block reordering algorithm, cache+, that is designed to optimize
i-cache performance.
On a high level, this algorithm is a greedy heuristic that merges
clusters (ordered sequences) of basic blocks, similarly to how it is
done in OptimizeCacheReorderAlgorithm. There are two important
differences: (a) the metric that is optimized in the procedure, and
(b) how two clusters are merged together.
Initially all clusters are isolated basic blocks. On every iteration,
we pick a pair of clusters whose merging yields the biggest increase
in the ExtTSP metric (see CacheMetrics.cpp for exact implementation),
which models how i-cache "friendly" a pecific cluster is. A pair of
clusters giving the maximum gain is merged to a new clusters. The
procedure stops when there is only one cluster left, or when merging
does not increase ExtTSP. In the latter case, the remaining clusters
are sorted by density.
An important aspect is the way two clusters are merged. Unlike earlier
algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
clusters, X and Y, are first split into three, X1, X2, and Y. Then we
consider all possible ways of gluing the three clusters (e.g., X1YX2,
X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the
largest score. This improves the quality of the final result (the
search space is larger) while keeping the implementation sufficiently
fast.
(cherry picked from FBD6466264)
2017-12-02 08:54:08 +08:00
|
|
|
if (Dist <= opts::BackwardDistance) {
|
|
|
|
double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
|
|
|
|
return opts::BackwardWeight * Prob * Count;
|
|
|
|
}
|
|
|
|
return 0;
|
2017-10-17 07:53:50 +08:00
|
|
|
}
|
|
|
|
|
2018-03-29 00:10:25 +08:00
|
|
|
/// Print layout- and cache-related statistics for the given functions.
void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0; // functions with a valid layout index
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  // Output-address extents of all emitted code and of the hot code only.
  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (BinaryBasicBlock *BB : BF->layout()) {
      NumBlocks++;
      size_t BBAddrMin = BB->getOutputAddressRange().first;
      size_t BBAddrMax = BB->getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      // A block is 'hot' when its function is placed by the layout and the
      // block itself was not split into the cold section.
      if (BF->hasValidIndex() && !BB->isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  // NOTE(review): the percentages below divide by NumFunctions/NumBlocks;
  // presumably BFs is never empty when this is called — confirm with callers.
  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

  size_t HugePage2MB = 2 << 20; // 2MB huge-page size in bytes
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));

  outs() << " ExtTSP score: "
         << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
}
|