forked from OSchip/llvm-project
[BOLT] Add option to print profile bias stats
Summary: Profile bias may arise from the hardware counter used to trigger LBR sampling, from the hardware implementation, and as an intrinsic characteristic of relying on LBRs. Since we infer fall-through execution and these non-taken branches take zero hardware resources to be represented, an LBR-based profile likely overrepresents paths with fall-throughs and underrepresents paths with many taken branches. This patch adds an option to print statistics about profile bias so we can better understand these biases. The goal is to analyze the difference between the sum of the frequencies of all incoming edges of a basic block and the sum of the frequencies of all its outgoing edges. In an ideally sampled profile, these differences should be close to zero. With this option, the user gets the mean of these differences in flow as a percentage of the input flow. For example, if this number is 15%, it means that, on average, a block observed 15% more or less flow going out of it than going in. We also print the standard deviation to give an idea of how spread out the individual flow-difference measurements are. If the variance is low, the average bias is happening across all blocks, which is compatible with using LBRs. If the variance is high, some blocks in the profile have a much higher bias than others, which is compatible with using a biased event such as cycles to sample LBRs, because it overrepresents paths that end in an expensive instruction. (cherry picked from FBD15790517)
This commit is contained in:
parent
1ec091e6f5
commit
bda13b7dd8
|
@ -200,6 +200,13 @@ PrintUCE("print-uce",
|
|||
cl::Hidden,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// Command-line switch "-print-profile-stats". It is off by default; when set,
// the pass manager registers the PrintProfileStats pass, which prints the
// profile quality/bias analysis.
static cl::opt<bool> PrintProfileStats(
    "print-profile-stats",
    cl::desc("print profile quality/bias analysis"),
    cl::ZeroOrMore,
    cl::init(false),
    cl::cat(BoltCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
SimplifyConditionalTailCalls("simplify-conditional-tail-calls",
|
||||
cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
|
||||
|
@ -369,6 +376,9 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
|
|||
// Run this pass first to use stats for the original functions.
|
||||
Manager.registerPass(llvm::make_unique<PrintProgramStats>(NeverPrint));
|
||||
|
||||
if (opts::PrintProfileStats)
|
||||
Manager.registerPass(llvm::make_unique<PrintProfileStats>(NeverPrint));
|
||||
|
||||
Manager.registerPass(llvm::make_unique<ValidateInternalCalls>(NeverPrint));
|
||||
|
||||
Manager.registerPass(llvm::make_unique<StripRepRet>(NeverPrint),
|
||||
|
|
|
@ -1252,6 +1252,99 @@ void AssignSections::runOnFunctions(BinaryContext &BC) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Measure how well the profile obeys flow conservation and report it.
///
/// For every non-empty, simple function, sums the profiled edge counts
/// entering and leaving each basic block. For each block with at least 100
/// incoming counts and a non-zero outgoing count, the bias is
/// |outgoing - incoming| / incoming. Prints the mean and the standard
/// deviation of that ratio over all such blocks, as percentages, and, when
/// opts::Verbosity >= 1, names the function with the highest per-function
/// mean.
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  // Per-function maps from a basic block to the summed frequency of its
  // incoming (resp. outgoing) edges. They are filled while computing the mean
  // and read again while computing the standard deviation.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Stores the relative flow imbalance of BB in Bias and returns true, or
  // returns false when BB is left out of the statistics: fewer than 100
  // incoming counts, or no outgoing flow at all.
  // NOTE(review): entry blocks are only excluded through the incoming-count
  // threshold; an entry block that is also a branch target with >= 100
  // incoming counts would be scored -- confirm this is intended.
  auto ComputeBias = [](FlowMapTy &Incoming, FlowMapTy &Outgoing,
                        const BinaryBasicBlock &BB, double &Bias) -> bool {
    const uint64_t In = Incoming[&BB];
    if (In < 100)
      return false;
    const uint64_t Out = Outgoing[&BB];
    if (Out == 0)
      return false;
    const double Difference = static_cast<double>(Out) - In;
    Bias = fabs(Difference / In);
    return true;
  };

  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // First pass: build the flow maps and accumulate the mean bias.
  for (const auto &Entry : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = Entry.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &Incoming = TotalIncomingMaps[&Function];
    FlowMapTy &Outgoing = TotalOutgoingMaps[&Function];

    for (const auto &BB : Function) {
      unsigned long long OutSum = 0;
      // Branch info is walked in lock-step with the successor list.
      auto BranchInfo = BB.branch_info_begin();
      for (auto Succ : BB.successors()) {
        const auto Count = BranchInfo->Count;
        ++BranchInfo;
        // Edges without profile data, or never taken, carry no flow.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
          continue;
        OutSum += Count;
        Incoming[Succ] += Count;
      }
      Outgoing[&BB] = OutSum;
    }

    size_t FuncBlocks = 0;
    double FuncBiasSum = 0.0;
    for (const auto &BB : Function) {
      double Bias = 0.0;
      if (!ComputeBias(Incoming, Outgoing, BB, Bias))
        continue;
      ++FuncBlocks;
      FuncBiasSum += Bias;
    }

    FlowImbalanceMean += FuncBiasSum;
    NumBlocksConsidered += FuncBlocks;
    if (FuncBlocks == 0)
      continue;

    // Track the function whose blocks are, on average, the most imbalanced.
    const double FuncMean = FuncBiasSum / FuncBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Second pass: accumulate squared deviations from the mean over the same
  // set of blocks.
  size_t NumSamples = 0;
  double SquaredDeviations = 0.0;
  for (const auto &Entry : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = Entry.second;
    if (Function.empty() || !Function.isSimple())
      continue;

    FlowMapTy &Incoming = TotalIncomingMaps[&Function];
    FlowMapTy &Outgoing = TotalOutgoingMaps[&Function];
    for (const auto &BB : Function) {
      double Bias = 0.0;
      if (!ComputeBias(Incoming, Outgoing, BB, Bias))
        continue;
      ++NumSamples;
      SquaredDeviations += pow(Bias - FlowImbalanceMean, 2);
    }
  }

  // Population standard deviation; stays at zero when no block qualified.
  double FlowImbalanceStdDev = 0.0;
  if (NumSamples)
    FlowImbalanceStdDev = sqrt(SquaredDeviations / NumSamples);

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean),
                   (100.0 * FlowImbalanceStdDev));

  if (!WorstBiasFunc || opts::Verbosity < 1)
    return;

  outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
         << "\n";
  DEBUG(WorstBiasFunc->dump());
}
|
||||
|
||||
void
|
||||
PrintProgramStats::runOnFunctions(BinaryContext &BC) {
|
||||
uint64_t NumSimpleFunctions{0};
|
||||
|
|
|
@ -347,6 +347,25 @@ class AssignSections : public BinaryFunctionPass {
|
|||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Compute and report to the user the imbalance in flow equations for all
|
||||
/// CFGs, so we can detect bad quality profile. Prints average and standard
|
||||
/// deviation of the absolute differences of outgoing flow minus incoming flow
|
||||
/// for blocks of interest (excluding prologues, epilogues, and BB frequency
|
||||
/// lower than 100).
|
||||
class PrintProfileStats : public BinaryFunctionPass {
|
||||
public:
|
||||
explicit PrintProfileStats(const cl::opt<bool> &PrintPass)
|
||||
: BinaryFunctionPass(PrintPass) { }
|
||||
|
||||
const char *getName() const override {
|
||||
return "profile-stats";
|
||||
}
|
||||
bool shouldPrint(const BinaryFunction &) const override {
|
||||
return false;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Prints a list of the top 100 functions sorted by a set of
|
||||
/// dyno stats categories.
|
||||
class PrintProgramStats : public BinaryFunctionPass {
|
||||
|
|
Loading…
Reference in New Issue