// Forked from OSchip/llvm-project (312 lines, 12 KiB, C++).
//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing metrics
// of cache lines.
//
//===----------------------------------------------------------------------===//
#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"

#include <cmath>
#include <unordered_map>
using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Tuning knobs for the metrics computed in this file. They are only declared
// here (extern); their definitions and defaults live in another translation
// unit.
extern cl::opt<double> ForwardWeight;      // multiplier for forward jumps in extTSPScore
extern cl::opt<double> BackwardWeight;     // multiplier for backward jumps in extTSPScore
extern cl::opt<unsigned> ForwardDistance;  // max forward jump distance that still scores
extern cl::opt<unsigned> BackwardDistance; // max backward jump distance that still scores
extern cl::opt<unsigned> ITLBPageSize;     // page size used by expectedCacheHitRatio
extern cl::opt<unsigned> ITLBEntries;      // i-TLB entry count used by expectedCacheHitRatio

} // namespace opts
namespace {
|
|
|
|
/// Initialize and return a position map for binary basic blocks
|
|
void extractBasicBlockInfo(
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
|
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
const BinaryContext &BC = BF->getBinaryContext();
|
|
for (BinaryBasicBlock *BB : BF->layout()) {
|
|
if (BF->isSimple() || BC.HasRelocations) {
|
|
// Use addresses/sizes as in the output binary
|
|
BBAddr[BB] = BB->getOutputAddressRange().first;
|
|
BBSize[BB] = BB->getOutputSize();
|
|
} else {
|
|
// Output ranges should match the input if the body hasn't changed
|
|
BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress();
|
|
BBSize[BB] = BB->getOriginalSize();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
|
|
/// the ordering of basic blocks
|
|
double
|
|
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
|
|
|
double Score = 0;
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
if (!BF->hasProfile())
|
|
continue;
|
|
for (BinaryBasicBlock *SrcBB : BF->layout()) {
|
|
auto BI = SrcBB->branch_info_begin();
|
|
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
|
|
if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
|
|
BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
|
|
Score += BI->Count;
|
|
++BI;
|
|
}
|
|
}
|
|
}
|
|
return Score;
|
|
}
|
|
|
|
/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
|
|
/// misses for a given ordering of basic blocks
|
|
double calcExtTSPScore(
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
|
|
|
double Score = 0.0;
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
if (!BF->hasProfile())
|
|
continue;
|
|
for (BinaryBasicBlock *SrcBB : BF->layout()) {
|
|
auto BI = SrcBB->branch_info_begin();
|
|
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
|
|
if (DstBB != SrcBB)
|
|
Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), BBSize.at(SrcBB),
|
|
BBAddr.at(DstBB), BI->Count);
|
|
++BI;
|
|
}
|
|
}
|
|
}
|
|
return Score;
|
|
}
|
|
|
|
using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
|
|
|
|
/// Build a simplified version of the call graph: For every function, keep
|
|
/// its callers and the frequencies of the calls
|
|
std::unordered_map<const BinaryFunction *, Predecessors>
|
|
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
|
|
std::unordered_map<const BinaryFunction *, Predecessors> Calls;
|
|
|
|
for (BinaryFunction *SrcFunction : BinaryFunctions) {
|
|
const BinaryContext &BC = SrcFunction->getBinaryContext();
|
|
for (BinaryBasicBlock *BB : SrcFunction->layout()) {
|
|
// Find call instructions and extract target symbols from each one
|
|
for (MCInst &Inst : *BB) {
|
|
if (!BC.MIB->isCall(Inst))
|
|
continue;
|
|
|
|
// Call info
|
|
const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
|
|
uint64_t Count = BB->getKnownExecutionCount();
|
|
// Ignore calls w/o information
|
|
if (DstSym == nullptr || Count == 0)
|
|
continue;
|
|
|
|
const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
|
|
// Ignore recursive calls
|
|
if (DstFunction == nullptr || DstFunction->layout_empty() ||
|
|
DstFunction == SrcFunction)
|
|
continue;
|
|
|
|
// Record the call
|
|
Calls[DstFunction].emplace_back(SrcFunction, Count);
|
|
}
|
|
}
|
|
}
|
|
return Calls;
|
|
}
|
|
|
|
/// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
|
|
/// Given an assignment of functions to the i-TLB pages), we divide all
|
|
/// functions calls into two categories:
|
|
/// - 'short' ones that have a caller-callee distance less than a page;
|
|
/// - 'long' ones where the distance exceeds a page.
|
|
/// The short calls are likely to result in a i-TLB cache hit. For the long
|
|
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
|
|
/// often the page is accessed). Assuming that functions are sent to the i-TLB
|
|
/// cache in a random order, the probability that a page is present in the cache
|
|
/// is proportional to the number of samples corresponding to the functions on
|
|
/// the page. The following procedure detects short and long calls, and
|
|
/// estimates the expected number of cache misses for the long ones.
|
|
double expectedCacheHitRatio(
|
|
const std::vector<BinaryFunction *> &BinaryFunctions,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
|
|
const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
|
|
|
|
const double PageSize = opts::ITLBPageSize;
|
|
const uint64_t CacheEntries = opts::ITLBEntries;
|
|
std::unordered_map<const BinaryFunction *, Predecessors> Calls =
|
|
extractFunctionCalls(BinaryFunctions);
|
|
// Compute 'hotness' of the functions
|
|
double TotalSamples = 0;
|
|
std::unordered_map<BinaryFunction *, double> FunctionSamples;
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
double Samples = 0;
|
|
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
|
|
Samples += Pair.second;
|
|
Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
|
|
FunctionSamples[BF] = Samples;
|
|
TotalSamples += Samples;
|
|
}
|
|
|
|
// Compute 'hotness' of the pages
|
|
std::unordered_map<uint64_t, double> PageSamples;
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
if (BF->layout_empty())
|
|
continue;
|
|
double Page = BBAddr.at(BF->layout_front()) / PageSize;
|
|
PageSamples[Page] += FunctionSamples.at(BF);
|
|
}
|
|
|
|
// Computing the expected number of misses for every function
|
|
double Misses = 0;
|
|
for (BinaryFunction *BF : BinaryFunctions) {
|
|
// Skip the function if it has no samples
|
|
if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0)
|
|
continue;
|
|
double Samples = FunctionSamples.at(BF);
|
|
double Page = BBAddr.at(BF->layout_front()) / PageSize;
|
|
// The probability that the page is not present in the cache
|
|
double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);
|
|
|
|
// Processing all callers of the function
|
|
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
|
|
BinaryFunction *SrcFunction = Pair.first;
|
|
double SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize;
|
|
// Is this a 'long' or a 'short' call?
|
|
if (Page != SrcPage) {
|
|
// This is a miss
|
|
Misses += MissProb * Pair.second;
|
|
}
|
|
Samples -= Pair.second;
|
|
}
|
|
assert(Samples >= 0.0 && "Function samples computed incorrectly");
|
|
// The remaining samples likely come from the jitted code
|
|
Misses += Samples * MissProb;
|
|
}
|
|
|
|
return 100.0 * (1.0 - Misses / TotalSamples);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
double CacheMetrics::extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
|
|
uint64_t DstAddr, uint64_t Count) {
|
|
assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
|
|
|
|
// Fallthrough
|
|
if (SrcAddr + SrcSize == DstAddr) {
|
|
// Assume that FallthroughWeight = 1.0 after normalization
|
|
return static_cast<double>(Count);
|
|
}
|
|
// Forward
|
|
if (SrcAddr + SrcSize < DstAddr) {
|
|
const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
|
|
if (Dist <= opts::ForwardDistance) {
|
|
double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
|
|
return opts::ForwardWeight * Prob * Count;
|
|
}
|
|
return 0;
|
|
}
|
|
// Backward
|
|
const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
|
|
if (Dist <= opts::BackwardDistance) {
|
|
double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
|
|
return opts::BackwardWeight * Prob * Count;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/// Print cache-related metrics for the given functions: hot/cold splitting
/// stats, expected i-TLB hit ratio, and the TSP / Ext-TSP layout scores.
void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (BinaryBasicBlock *BB : BF->layout()) {
      NumBlocks++;
      size_t BBAddrMin = BB->getOutputAddressRange().first;
      size_t BBAddrMax = BB->getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB->isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  // Guard against empty input: every percentage below would divide by zero
  // and the min/max address ranges would be inverted.
  if (NumFunctions == 0 || NumBlocks == 0) {
    outs() << " There are no functions to compute cache metrics for\n";
    return;
  }

  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  // If no block is hot, Hot{Min,Max}Addr keep their initial values and the
  // subtraction would wrap around; report a zero-sized hot region instead.
  size_t HotCodeSize =
      HotCodeMaxAddr >= HotCodeMinAddr ? HotCodeMaxAddr - HotCodeMinAddr : 0;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

  size_t HugePage2MB = 2 << 20;
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize));

  outs() << " ExtTSP score: "
         << format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize));
}