//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//

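// Editorial note (not BOLT output): the sketch below illustrates the
// transformation this pass performs; register names and symbols (%rax, foo)
// are placeholders. Conceptually, a hot indirect call such as
//
//   callq *%rax
//
// is rewritten into a compare-and-branch sequence over its hottest profiled
// targets, keeping the original indirect call on the fallback path:
//
//   cmpq  $foo, %rax
//   jne   .Lnext
//   callq foo                # promoted direct call
//   jmp   .Lmerge
// .Lnext:
//   callq *%rax              # cold targets still use the indirect call
// .Lmerge:
//
// For jump tables, the same idea is applied to the indirect jump, either by
// comparing table indices or, with -icp-jump-tables-targets, jump targets.
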
#include "bolt/Passes/IndirectCallPromotion.h"
|
|
#include "bolt/Passes/BinaryFunctionCallGraph.h"
|
|
#include "bolt/Passes/DataflowInfoManager.h"
|
|
#include "bolt/Passes/Inliner.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#define DEBUG_TYPE "ICP"
|
|
#define DEBUG_VERBOSE(Level, X) \
|
|
if (opts::Verbosity >= (Level)) { \
|
|
X; \
|
|
}
|
|
|
|
using namespace llvm;
|
|
using namespace bolt;
|
|
|
|
namespace opts {
|
|
|
|
extern cl::OptionCategory BoltOptCategory;
|
|
|
|
extern cl::opt<IndirectCallPromotionType> ICP;
|
|
extern cl::opt<unsigned> Verbosity;
|
|
extern cl::opt<unsigned> ExecutionCountThreshold;
|
|
|
|
static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
|
|
"icp-jt-remaining-percent-threshold",
|
|
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
|
"call count for the promotion for jump tables"),
|
|
cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPJTTotalPercentThreshold(
|
|
"icp-jt-total-percent-threshold",
|
|
cl::desc(
|
|
"The percentage threshold against total count for the promotion for "
|
|
"jump tables"),
|
|
cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
|
|
"icp-calls-remaining-percent-threshold",
|
|
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
|
"call count for the promotion for calls"),
|
|
cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
|
|
"icp-calls-total-percent-threshold",
|
|
cl::desc(
|
|
"The percentage threshold against total count for the promotion for "
|
|
"calls"),
|
|
cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPMispredictThreshold(
|
|
"indirect-call-promotion-mispredict-threshold",
|
|
cl::desc("misprediction threshold for skipping ICP on an "
|
|
"indirect call"),
|
|
cl::init(0), cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<bool> ICPUseMispredicts(
|
|
"indirect-call-promotion-use-mispredicts",
|
|
cl::desc("use misprediction frequency for determining whether or not ICP "
|
|
"should be applied at a callsite. The "
|
|
"-indirect-call-promotion-mispredict-threshold value will be used "
|
|
"by this heuristic"),
|
|
cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned>
|
|
ICPTopN("indirect-call-promotion-topn",
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
"call promotion. 0 = no limit"),
|
|
cl::init(3), cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPCallsTopN(
|
|
"indirect-call-promotion-calls-topn",
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
"call promotion on calls. 0 = no limit"),
|
|
cl::init(0), cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPJumpTablesTopN(
|
|
"indirect-call-promotion-jump-tables-topn",
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
"call promotion on jump tables. 0 = no limit"),
|
|
cl::init(0), cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<bool> EliminateLoads(
|
|
"icp-eliminate-loads",
|
|
cl::desc("enable load elimination using memory profiling data when "
|
|
"performing ICP"),
|
|
cl::init(true), cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<unsigned> ICPTopCallsites(
|
|
"icp-top-callsites",
|
|
cl::desc("optimize hottest calls until at least this percentage of all "
|
|
"indirect calls frequency is covered. 0 = all callsites"),
|
|
cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::list<std::string>
|
|
ICPFuncsList("icp-funcs", cl::CommaSeparated,
|
|
cl::desc("list of functions to enable ICP for"),
|
|
cl::value_desc("func1,func2,func3,..."), cl::Hidden,
|
|
cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<bool>
|
|
ICPOldCodeSequence("icp-old-code-sequence",
|
|
cl::desc("use old code sequence for promoted calls"),
|
|
cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<bool> ICPJumpTablesByTarget(
|
|
"icp-jump-tables-targets",
|
|
cl::desc(
|
|
"for jump tables, optimize indirect jmp targets instead of indices"),
|
|
cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
static cl::opt<bool> ICPPeelForInline(
|
|
"icp-inline", cl::desc("only promote call targets eligible for inlining"),
|
|
cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
} // namespace opts
|
|
|
|
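// Debug-build sanity check: after ICP, every CFG edge that carries a positive
// profile count should connect two blocks that both have non-zero execution
// counts. Emits a BOLT-WARNING and returns false otherwise.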
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
  bool IsValid = true;
  for (auto &BFI : BFs) {
    BinaryFunction &BF = BFI.second;
    if (!BF.isSimple())
      continue;
    for (BinaryBasicBlock *BB : BF.layout()) {
      auto BI = BB->branch_info_begin();
      for (BinaryBasicBlock *SuccBB : BB->successors()) {
        if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
          if (BB->getKnownExecutionCount() == 0 ||
              SuccBB->getKnownExecutionCount() == 0) {
            errs() << "BOLT-WARNING: profile verification failed after ICP for "
                      "function "
                   << BF << '\n';
            IsValid = false;
          }
        }
        ++BI;
      }
    }
  }
  return IsValid;
}

namespace llvm {
namespace bolt {

IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  if (ICP.Symbol) {
    To.Sym = ICP.Symbol;
    To.Addr = 0;
  }
}

void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t TotalCount = 0;
  uint64_t TotalMispreds = 0;
  for (const Callsite &S : Targets) {
    TotalCount += S.Branches;
    TotalMispreds += S.Mispreds;
  }
  if (!TotalCount)
    TotalCount = 1;
  if (!TotalMispreds)
    TotalMispreds = 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  size_t I = 0;
  for (const Callsite &S : Targets) {
    OS << "Count = " << S.Branches << ", "
       << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
       << "Mispreds = " << S.Mispreds << ", "
       << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
    if (I < N)
      OS << " * to be optimized *";
    if (!S.JTIndices.empty()) {
      OS << " Indices:";
      for (const uint64_t Idx : S.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";
    I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
  }
}

// Get list of targets for a given call sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
                                      const MCInst &Inst) const {
  BinaryFunction &BF = *BB.getFunction();
  const BinaryContext &BC = BF.getBinaryContext();
  std::vector<Callsite> Targets;

  if (const JumpTable *JT = BF.getJumpTable(Inst)) {
    // Don't support PIC jump tables for now
    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
      return Targets;
    const Location From(BF.getSymbol());
    const std::pair<size_t, size_t> Range =
        JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
    assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
    JumpTable::JumpInfo DefaultJI;
    const JumpTable::JumpInfo *JI =
        JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
    const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
    assert(JT->Type == JumpTable::JTT_PIC ||
           JT->EntrySize == BC.AsmInfo->getCodePointerSize());
    for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
      MCSymbol *Entry = JT->Entries[I];
      assert(BF.getBasicBlockForLabel(Entry) ||
             Entry == BF.getFunctionEndLabel() ||
             Entry == BF.getFunctionColdEndLabel());
      if (Entry == BF.getFunctionEndLabel() ||
          Entry == BF.getFunctionColdEndLabel())
        continue;
      const Location To(Entry);
      const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Entry);
      Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
                           I - Range.first);
    }

    // Sort by symbol then addr.
    std::sort(Targets.begin(), Targets.end(),
              [](const Callsite &A, const Callsite &B) {
                if (A.To.Sym && B.To.Sym)
                  return A.To.Sym < B.To.Sym;
                else if (A.To.Sym && !B.To.Sym)
                  return true;
                else if (!A.To.Sym && B.To.Sym)
                  return false;
                else
                  return A.To.Addr < B.To.Addr;
              });

    // Targets may contain multiple entries to the same target, but using
    // different indices. Their profile will report the same number of branches
    // for different indices if the target is the same. That's because we don't
    // profile the index value, but only the target via LBR.
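    // Illustrative example: entries {A@index 0, A@index 2, B@index 1} are
    // folded below into {A with indices [0, 2], B with indices [1]}; A keeps
    // the counts of its first entry since, as noted above, all entries for
    // the same target report identical counts.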
    auto First = Targets.begin();
    auto Last = Targets.end();
    auto Result = First;
    while (++First != Last) {
      Callsite &A = *Result;
      const Callsite &B = *First;
      if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
        A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
                           B.JTIndices.end());
      else
        *(++Result) = *First;
    }
    ++Result;

    LLVM_DEBUG(if (Targets.end() - Result > 0) {
      dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
             << " duplicate targets removed\n";
    });

    Targets.erase(Result, Targets.end());
  } else {
    // Don't try to optimize PC relative indirect calls.
    if (Inst.getOperand(0).isReg() &&
        Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
      return Targets;

    const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
        Inst, "CallProfile");
    if (ICSP) {
      for (const IndirectCallProfile &CSP : ICSP.get()) {
        Callsite Site(BF, CSP);
        if (Site.isValid())
          Targets.emplace_back(std::move(Site));
      }
    }
  }

  // Sort by target count, number of indices in case of jump table, and
  // mispredicts. We prioritize targets with high count, small number of indices
  // and high mispredicts. Break ties by selecting targets with lower addresses.
  std::stable_sort(Targets.begin(), Targets.end(),
                   [](const Callsite &A, const Callsite &B) {
                     if (A.Branches != B.Branches)
                       return A.Branches > B.Branches;
                     if (A.JTIndices.size() != B.JTIndices.size())
                       return A.JTIndices.size() < B.JTIndices.size();
                     if (A.Mispreds != B.Mispreds)
                       return A.Mispreds > B.Mispreds;
                     return A.To.Addr < B.To.Addr;
                   });

  // Remove non-symbol targets
  auto Last = std::remove_if(Targets.begin(), Targets.end(),
                             [](const Callsite &CS) { return !CS.To.Sym; });
  Targets.erase(Last, Targets.end());

  LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
    uint64_t TotalCount = 0;
    uint64_t TotalMispreds = 0;
    for (const Callsite &S : Targets) {
      TotalCount += S.Branches;
      TotalMispreds += S.Mispreds;
    }
    if (!TotalCount)
      TotalCount = 1;
    if (!TotalMispreds)
      TotalMispreds = 1;

    dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
           << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
           << "\n";

    size_t I = 0;
    for (const Callsite &S : Targets) {
      dbgs() << "Count[" << I << "] = " << S.Branches << ", "
             << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
             << "Mispreds[" << I << "] = " << S.Mispreds << ", "
             << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
      ++I;
    }
  });

  return Targets;
}

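// Using the memory-access profile attached to the load that reads the jump
// table, recover which table entries (indices) are hot. Returns (count, index)
// pairs sorted by descending count and reports the load via TargetFetchInst so
// the caller can later eliminate it; returns an empty vector whenever the data
// is missing, stale, or inconsistent.
//
// Illustrative example (made-up numbers): with 8-byte entries, a profiled
// access that lands 0x10 bytes past the start of the table corresponds to
// index 0x10 / 8 = 2.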
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
                                                   MCInst &CallInst,
                                                   MCInst *&TargetFetchInst,
                                                   const JumpTable *JT) const {
  assert(JT && "Can't get jump table addrs for non-jump tables.");

  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return JumpTableInfoType();

  JumpTableInfoType HotTargets;
  MCInst *MemLocInstr;
  MCInst *PCRelBaseOut;
  unsigned BaseReg, IndexReg;
  int64_t DispValue;
  const MCExpr *DispExpr;
  MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
  const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
      CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);

  assert(MemLocInstr && "There should always be a load for jump tables");
  if (!MemLocInstr)
    return JumpTableInfoType();

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
           << "jump table in " << Function << " at @ "
           << (&CallInst - &BB.front()) << "\n"
           << "BOLT-INFO: ICP target fetch instructions:\n";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    if (MemLocInstr != &CallInst)
      BC.printInstruction(dbgs(), CallInst, 0, &Function);
  });

  DEBUG_VERBOSE(1, {
    dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
           << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
           << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
           << "DispValue = " << Twine::utohexstr(DispValue) << ", "
           << "DispExpr = " << DispExpr << ", "
           << "MemLocInstr = ";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    dbgs() << "\n";
  });

  ++TotalIndexBasedCandidates;

  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return JumpTableInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  uint64_t ArrayStart;
  if (DispExpr) {
    ErrorOr<uint64_t> DispValueOrError =
        BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
    assert(DispValueOrError && "global symbol needs a value");
    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }

  if (BaseReg == BC.MRI->getProgramCounter())
    ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;

  // This is a map of [symbol] -> [count, index] and is used to combine indices
  // into the jump table since there may be multiple addresses that all have the
  // same entry.
  std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
  const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    size_t Index;
    // Mem data occasionally includes nullptrs; ignore them.
    if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
      continue;

    if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
      return JumpTableInfoType();

    if (AccessInfo.MemoryObject) {
      // Deal with bad/stale data
      if (!AccessInfo.MemoryObject->getName().startswith(
              "JUMP_TABLE/" + Function.getOneName().str()))
        return JumpTableInfoType();
      Index =
          (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
    } else {
      Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
    }

    // If Index is out of range it probably means the memory profiling data is
    // wrong for this instruction, bail out.
    if (Index >= Range.second) {
      LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
                        << ", " << Range.second << "\n");
      return JumpTableInfoType();
    }

    // Make sure the hot index points at a legal label corresponding to a BB,
    // e.g. not the end of function (unreachable) label.
    if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
      LLVM_DEBUG({
        dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
               << "label " << JT->Entries[Index + Range.first]->getName()
               << " in jump table:\n";
        JT->print(dbgs());
        dbgs() << "HotTargetMap:\n";
        for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
             HotTargetMap)
          dbgs() << "BOLT-INFO: " << HT.first->getName()
                 << " = (count=" << HT.second.first
                 << ", index=" << HT.second.second << ")\n";
      });
      return JumpTableInfoType();
    }

    std::pair<uint64_t, uint64_t> &HotTarget =
        HotTargetMap[JT->Entries[Index + Range.first]];
    HotTarget.first += AccessInfo.Count;
    HotTarget.second = Index;
  }

  std::transform(
      HotTargetMap.begin(), HotTargetMap.end(), std::back_inserter(HotTargets),
      [](const std::pair<MCSymbol *, std::pair<uint64_t, uint64_t>> &A) {
        return A.second;
      });

  // Sort with highest counts first.
  std::sort(HotTargets.rbegin(), HotTargets.rend());

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
    for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
      dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
             << "Count = " << Target.first << "\n";
  });

  BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;

  TargetFetchInst = MemLocInstr;

  return HotTargets;
}

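// Map the top N promotion candidates onto (symbol, jump-table index) pairs.
// For plain indirect calls this is a direct copy of the target symbols. For
// jump tables, hot-entry information from maybeGetHotJumpTableTargets (when
// available) is used to split targets per index and to recompute N, so the
// value of N seen by the caller may change here.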
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
                                             size_t &N, BinaryBasicBlock &BB,
                                             MCInst &CallInst,
                                             MCInst *&TargetFetchInst) const {
  const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
  SymTargetsType SymTargets;

  if (!JT) {
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.emplace_back(Targets[I].To.Sym, 0);
    }
    return SymTargets;
  }

  // Use memory profile to select hot targets.
  JumpTableInfoType HotTargets =
      maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);

  auto findTargetsIndex = [&](uint64_t JTIndex) {
    for (size_t I = 0; I < Targets.size(); ++I)
      if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
        return I;
LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
|
|
<< " table entry in " << *BB.getFunction() << "\n");
|
|
    llvm_unreachable("Hot indices must be referred to by at least one "
                     "callsite");
  };

  if (!HotTargets.empty()) {
    if (opts::Verbosity >= 1)
      for (size_t I = 0; I < HotTargets.size(); ++I)
        outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
               << ", " << HotTargets[I].second << ")\n";

    // Recompute hottest targets, now discriminating which index is hot
    // NOTE: This is a tradeoff. On one hand, we get index information. On the
    // other hand, info coming from the memory profile is much less accurate
    // than LBRs. So we may actually end up working with more coarse
    // profile granularity in exchange for information about indices.
    std::vector<Callsite> NewTargets;
    std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
    uint64_t TotalMemAccesses = 0;
    for (size_t I = 0; I < HotTargets.size(); ++I) {
      const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
      ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
      TotalMemAccesses += HotTargets[I].first;
    }
    uint64_t RemainingMemAccesses = TotalMemAccesses;
    const size_t TopN =
        opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
    size_t I = 0;
    for (; I < HotTargets.size(); ++I) {
      const uint64_t MemAccesses = HotTargets[I].first;
      if (100 * MemAccesses <
          TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
        break;
      if (100 * MemAccesses <
          RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
        break;
      if (TopN && I >= TopN)
        break;
      RemainingMemAccesses -= MemAccesses;

      const uint64_t JTIndex = HotTargets[I].second;
      Callsite &Target = Targets[findTargetsIndex(JTIndex)];

      NewTargets.push_back(Target);
      std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
      Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
                                         Target.JTIndices.end(), JTIndex),
                             Target.JTIndices.end());

      // Keep fixCFG counts sane if more indices use this same target later
      assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
      NewTargets.back().Branches =
          Target.Branches / IndicesPerTarget[Target.To.Sym];
      NewTargets.back().Mispreds =
          Target.Mispreds / IndicesPerTarget[Target.To.Sym];
      assert(Target.Branches >= NewTargets.back().Branches);
      assert(Target.Mispreds >= NewTargets.back().Mispreds);
      Target.Branches -= NewTargets.back().Branches;
      Target.Mispreds -= NewTargets.back().Mispreds;
    }
    std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
    std::swap(NewTargets, Targets);
    N = I;

    if (N == 0 && opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
             << BB.getName() << ": failed to meet thresholds after memory "
             << "profile data was loaded.\n";
      return SymTargets;
    }
  }

  for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
    Callsite &Target = Targets[TgtIdx];
    assert(Target.To.Sym && "All ICP targets must be to known symbols");
    assert(!Target.JTIndices.empty() && "Jump tables must have indices");
    for (uint64_t Idx : Target.JTIndices) {
      SymTargets.emplace_back(Target.To.Sym, Idx);
      ++I;
    }
  }

  return SymTargets;
}

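// For a virtual method call, try to pair each target method with the vtable it
// was loaded from, using memory profiling on the method-load instruction. On
// success, returns the (vtable symbol, addend) pairs plus the instructions
// that fetch the method pointer, so promoted calls can compare the vtable
// address directly and the loads can be removed.
//
// Illustrative shape of the pattern involved (x86-64, placeholder registers
// and offsets; the actual matching is done by analyzeVirtualMethodCall):
//
//   movq  (%rdi), %rax        # load vtable pointer
//   callq *0x10(%rax)         # load method at vtable+0x10 and call it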
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
    BinaryBasicBlock &BB, MCInst &Inst,
    const SymTargetsType &SymTargets) const {
  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();
  std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
  std::vector<MCInst *> MethodFetchInsns;
  unsigned VtableReg, MethodReg;
  uint64_t MethodOffset;

  assert(!Function.getJumpTable(Inst) &&
         "Can't get vtable addrs for jump tables.");

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return MethodInfoType();

  MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
  if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
                                        MethodFetchInsns, VtableReg, MethodReg,
                                        MethodOffset)) {
    DEBUG_VERBOSE(
        1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
                  << Function << " at @ " << (&Inst - &BB.front()) << "\n");
    return MethodInfoType();
  }

  ++TotalMethodLoadEliminationCandidates;

  DEBUG_VERBOSE(1, {
    dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
           << " at @ " << (&Inst - &BB.front()) << "\n";
    dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *Inst : MethodFetchInsns)
      BC.printInstruction(dbgs(), *Inst, 0, &Function);

    if (MethodFetchInsns.back() != &Inst)
      BC.printInstruction(dbgs(), Inst, 0, &Function);
  });

  // Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  // Find the vtable that each method belongs to.
  std::map<const MCSymbol *, uint64_t> MethodToVtable;

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    uint64_t Address = AccessInfo.Offset;
    if (AccessInfo.MemoryObject)
      Address += AccessInfo.MemoryObject->getAddress();

    // Ignore bogus data.
    if (!Address)
      continue;

    const uint64_t VtableBase = Address - MethodOffset;

    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
                            << Twine::utohexstr(VtableBase) << "+"
                            << MethodOffset << "/" << AccessInfo.Count << "\n");

    if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
      BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD) // skip unknown methods
        continue;
      MCSymbol *MethodSym = MethodBD->getSymbol();
      MethodToVtable[MethodSym] = VtableBase;
      DEBUG_VERBOSE(1, {
        const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
        dbgs() << "BOLT-INFO: ICP found method = "
               << Twine::utohexstr(MethodAddr.get()) << "/"
               << (Method ? Method->getPrintName() : "") << "\n";
      });
    }
  }

  // Find the vtable for each target symbol.
  for (size_t I = 0; I < SymTargets.size(); ++I) {
    auto Itr = MethodToVtable.find(SymTargets[I].first);
    if (Itr != MethodToVtable.end()) {
      if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
        const uint64_t Addend = Itr->second - BD->getAddress();
        VtableSyms.emplace_back(BD->getSymbol(), Addend);
        continue;
      }
    }
    // Give up if we can't find the vtable for a method.
    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
                            << SymTargets[I].first->getName() << "\n");
    return MethodInfoType();
  }

  // Make sure the vtable reg is not clobbered by the argument passing code
  if (VtableReg != MethodReg) {
    for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
         ++CurInst) {
      const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
      if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
        return MethodInfoType();
    }
  }

  return MethodInfoType(VtableSyms, MethodFetchInsns);
}

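// Splice the generated ICP code into the function: the first fragment of
// ICPcode replaces (or follows) the original indirect call inside
// IndCallBlock, and every remaining fragment becomes a new basic block. For
// regular calls, the instructions that used to follow the call are moved into
// the last new block, which acts as the merge block in fixCFG.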
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
    BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
    MCPlusBuilder::BlocksVectorTy &&ICPcode,
    const std::vector<MCInst *> &MethodFetchInsns) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();

  // Create new basic blocks with correct code in each one first.
  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
  const bool IsTailCallOrJT =
      (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));

  // Move instructions from the tail of the original call block
  // to the merge block.

  // Remember any pseudo instructions following a tail call. These
  // must be preserved and moved to the original block.
  InstructionListType TailInsts;
  const MCInst *TailInst = &CallInst;
  if (IsTailCallOrJT)
    while (TailInst + 1 < &(*IndCallBlock.end()) &&
           MIB->isPseudo(*(TailInst + 1)))
      TailInsts.push_back(*++TailInst);

  InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
  // Link new BBs to the original input offset of the BB where the indirect
  // call site is, so we can map samples recorded in new BBs back to the
  // original BB seen in the input binary (if using BAT)
  const uint32_t OrigOffset = IndCallBlock.getInputOffset();

  IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
                                 MethodFetchInsns.end());
  if (IndCallBlock.empty() ||
      (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
    IndCallBlock.addInstructions(ICPcode.front().second.begin(),
                                 ICPcode.front().second.end());
  else
    IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
                                    ICPcode.front().second);
  IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());

  for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
    MCSymbol *&Sym = Itr->first;
    InstructionListType &Insts = Itr->second;
    assert(Sym);
    std::unique_ptr<BinaryBasicBlock> TBB =
        Function.createBasicBlock(OrigOffset, Sym);
    for (MCInst &Inst : Insts) // sanitize new instructions.
      if (MIB->isCall(Inst))
        MIB->removeAnnotation(Inst, "CallProfile");
    TBB->addInstructions(Insts.begin(), Insts.end());
    NewBBs.emplace_back(std::move(TBB));
  }

  // Move tail of instructions from after the original call to
  // the merge block.
  if (!IsTailCallOrJT)
    NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());

  return NewBBs;
}

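// Wire the new blocks produced by rewriteCall into the CFG and distribute the
// original block's execution counts across them. For jump tables, the new
// blocks form a chain of compare-and-branch blocks whose final block inherits
// the remaining indirect jump and all of the original successors. For calls,
// blocks come in compare/call pairs and, unless the call was a tail call, a
// merge block collects the fall-through paths; that merge block (or nullptr)
// is returned so the caller can continue scanning it.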
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1;
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    const size_t NumEntries =
        std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
                           (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
                                    (NumEntries * TotalIndirectBranches)),
                           uint64_t(TotalCount * Target.Mispreds /
                                    (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);

    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(SymTargets[I]);
      SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken

      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through

      // Update branch info for the indirect jump.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(*TargetBB);
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
    IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);

    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        if (MergeBlock)
          NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
      } else {
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
        NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(MBPtr, NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
  Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
  assert(Function.validateCFG());

  return MergeBlock;
}

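// Decide how many targets (N) at this callsite are worth promoting, or 0 to
// skip it. Candidates must clear the execution-count threshold, the per-target
// total and remaining percentage thresholds (or the misprediction-frequency
// heuristic when -indirect-call-promotion-use-mispredicts is set), and the
// optional inlining and function-name filters.
//
// Illustrative example (made-up counts, default thresholds): a call with
// 1000 samples split 600/350/50 across three targets promotes the first two
// targets; the third fails the 30%-of-total threshold for calls
// (50 < 0.30 * 1000).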
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
             << BB.getName() << ", calls = " << NumCalls
             << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  if (opts::ICPTopCallsites > 0) {
    if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
      return 0;
  }

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    double Threshold = double(opts::ICPMispredictThreshold);
    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
      Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
      TotalMispredictsTopN += Targets[I].Mispreds;
    }
    computeStats(N);

    // Compute the misprediction frequency of the top N call targets. If this
    // frequency is greater than the threshold, we should try ICP on this
    // callsite.
    const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
    if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
               << " in " << BB.getName() << ", calls = " << NumCalls
               << ", top N mis. frequency " << format("%.1f", TopNFrequency)
               << "% < " << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  } else {
    size_t MaxTargets = 0;

    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    const unsigned TotalThreshold = IsJumpTable
                                        ? opts::ICPJTTotalPercentThreshold
                                        : opts::ICPCallsTotalPercentThreshold;
    const unsigned RemainingThreshold =
        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                    : opts::ICPCallsRemainingPercentThreshold;
    uint64_t NumRemainingCalls = NumCalls;
    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
        break;
      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
        break;
      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalMispredictsTopN += Targets[I].Mispreds;
      NumRemainingCalls -= Targets[I].Branches;
      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
      // Compute the misprediction frequency of the top N call targets. If
      // this frequency is less than the threshold, we should skip ICP at
      // this callsite.
      const double TopNMispredictFrequency =
          (100.0 * TotalMispredictsTopN) / NumCalls;

      if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
        if (opts::Verbosity >= 1) {
          const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
          outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                 << " in " << BB.getName() << ", calls = " << NumCalls
                 << ", top N mispredict frequency "
                 << format("%.1f", TopNMispredictFrequency) << "% < "
                 << opts::ICPMispredictThreshold << "%\n";
        }
        return 0;
      }
    }
  }

  // Filter by inline-ability of target functions, stop at first target that
  // can't be inlined.
  if (opts::ICPPeelForInline) {
    for (size_t I = 0; I < N; ++I) {
      const MCSymbol *TargetSym = Targets[I].To.Sym;
      const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym);
      if (!BinaryFunctionPass::shouldOptimize(*TargetBF) ||
          getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) {
        N = I;
        break;
      }
    }
  }

  // Filter functions that can have ICP applied (for debugging)
  if (!opts::ICPFuncsList.empty()) {
    for (std::string &Name : opts::ICPFuncsList)
      if (BF->hasName(Name))
        return N;
    return 0;
  }

  return N;
}

void IndirectCallPromotion::printCallsiteInfo(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, const size_t N,
    uint64_t NumCalls) const {
  BinaryContext &BC = BB.getFunction()->getBinaryContext();
  const bool IsTailCall = BC.MIB->isTailCall(Inst);
  const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
  const ptrdiff_t InstIdx = &Inst - &(*BB.begin());

  outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
         << " @ " << InstIdx << " in " << BB.getName()
         << " -> calls = " << NumCalls
         << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
         << "\n";
  for (size_t I = 0; I < N; I++) {
    const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
    const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
    outs() << "BOLT-INFO: ";
    if (Targets[I].To.Sym)
      outs() << Targets[I].To.Sym->getName();
    else
      outs() << Targets[I].To.Addr;
    outs() << ", calls = " << Targets[I].Branches
           << ", mispreds = " << Targets[I].Mispreds
           << ", taken freq = " << format("%.1f", Frequency) << "%"
           << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
    bool First = true;
    for (uint64_t JTIndex : Targets[I].JTIndices) {
      outs() << (First ? ", indices = " : ", ") << JTIndex;
      First = false;
    }
    outs() << "\n";
  }

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP original call instruction:";
    BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
  });
}

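// Pass driver. Builds the call graph and register-liveness analysis when jump
// tables are optimized, and, when -icp-top-callsites is non-zero (default 99),
// narrows the work list to the callsites covering that percentage of all
// indirect-call counts. Each candidate callsite is then processed: collect
// targets, check thresholds, resolve target symbols (plus vtables or hot
// jump-table indices), emit the promoted code, and patch the CFG. Summary
// statistics are printed at the end.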
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
  if (opts::ICP == ICP_NONE)
    return;

  auto &BFs = BC.getBinaryFunctions();

  const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL);
  const bool OptimizeJumpTables =
      (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL);

  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (OptimizeJumpTables) {
    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
    RA.reset(new RegAnalysis(BC, &BFs, &*CG));
  }

  // If icp-top-callsites is enabled, compute the total number of indirect
  // calls and then optimize the hottest callsites that contribute to that
  // total.
  SetVector<BinaryFunction *> Functions;
  if (opts::ICPTopCallsites == 0) {
    for (auto &KV : BFs)
      Functions.insert(&KV.second);
  } else {
    using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
    std::vector<IndirectCallsite> IndirectCalls;
    size_t TotalIndirectCalls = 0;

    // Find all the indirect callsites.
    for (auto &BFIt : BFs) {
      BinaryFunction &Function = BFIt.second;

      if (!Function.isSimple() || Function.isIgnored() ||
          !Function.hasProfile())
        continue;

      const bool HasLayout = !Function.layout_empty();

      for (BinaryBasicBlock &BB : Function) {
        if (HasLayout && Function.isSplit() && BB.isCold())
          continue;

        for (MCInst &Inst : BB) {
          const bool IsJumpTable = Function.getJumpTable(Inst);
          const bool HasIndirectCallProfile =
              BC.MIB->hasAnnotation(Inst, "CallProfile");
          const bool IsDirectCall =
              (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));

          if (!IsDirectCall &&
              ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
               (IsJumpTable && OptimizeJumpTables))) {
            uint64_t NumCalls = 0;
            for (const Callsite &BInfo : getCallTargets(BB, Inst))
              NumCalls += BInfo.Branches;
            IndirectCalls.push_back(
                std::make_tuple(NumCalls, &Inst, &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
      }
    }

    // Sort callsites by execution count.
    std::sort(IndirectCalls.rbegin(), IndirectCalls.rend());

    // Find callsites that contribute to the top "opts::ICPTopCallsites"%
    // number of calls.
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
    size_t Num = 0;
    for (const IndirectCallsite &IC : IndirectCalls) {
      const uint64_t CurFreq = std::get<0>(IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency of the last one to avoid non-deterministic behavior
      // (e.g. turning on/off ICP depending on the order of functions)
      if (MaxCalls <= 0 && CurFreq != LastFreq)
        break;
      MaxCalls -= CurFreq;
      LastFreq = CurFreq;
      BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
      Functions.insert(std::get<2>(IC));
      ++Num;
    }
    outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
           << ", " << Num << " callsites cover " << opts::ICPTopCallsites
           << "% of all indirect calls\n";
  }

  for (BinaryFunction *FuncPtr : Functions) {
    BinaryFunction &Function = *FuncPtr;

    if (!Function.isSimple() || Function.isIgnored() || !Function.hasProfile())
      continue;

    const bool HasLayout = !Function.layout_empty();

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;
    uint64_t FuncTotalIndirectJmps = 0;

    std::vector<BinaryBasicBlock *> BBs;
    for (BinaryBasicBlock &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold())
        BBs.push_back(&BB);
    }
    if (BBs.empty())
      continue;

    DataflowInfoManager Info(Function, RA.get(), nullptr);
    while (!BBs.empty()) {
      BinaryBasicBlock *BB = BBs.back();
      BBs.pop_back();

      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        MCInst &Inst = BB->getInstructionAtIndex(Idx);
        const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
        const bool IsTailCall = BC.MIB->isTailCall(Inst);
        const bool HasIndirectCallProfile =
            BC.MIB->hasAnnotation(Inst, "CallProfile");
        const bool IsJumpTable = Function.getJumpTable(Inst);

        if (BC.MIB->isCall(Inst))
          TotalCalls += BB->getKnownExecutionCount();

        if (IsJumpTable && !OptimizeJumpTables)
          continue;

        if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
          continue;

        // Ignore direct calls.
        if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
          continue;

        assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
               "expected a call or an indirect jump instruction");

        if (IsJumpTable)
          ++TotalJumpTableCallsites;
        else
          ++TotalIndirectCallsites;

        std::vector<Callsite> Targets = getCallTargets(*BB, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const Callsite &BInfo : Targets)
          NumCalls += BInfo.Branches;
        if (!IsJumpTable)
          FuncTotalIndirectCalls += NumCalls;
        else
          FuncTotalIndirectJmps += NumCalls;

        // If the FLAGS register is live after this jump site, do not try
        // promoting because we will clobber FLAGS.
        if (IsJumpTable) {
          ErrorOr<const BitVector &> State =
              Info.getLivenessAnalysis().getStateBefore(Inst);
          if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
            if (opts::Verbosity >= 1)
              outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                     << InstIdx << " in " << BB->getName()
                     << ", calls = " << NumCalls
                     << (State ? ", cannot clobber flags reg.\n"
                               : ", no liveness data available.\n");
            continue;
          }
        }

        // Should this callsite be optimized? Return the number of targets
        // to use when promoting this call. A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);

        // If it is a jump table and it failed to meet our initial threshold,
        // proceed to findCallTargetSymbols -- it may reevaluate N if
        // memory profile is present
        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1)
          printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);

        // Find MCSymbols or absolute addresses for each call target.
        MCInst *TargetFetchInst = nullptr;
        const SymTargetsType SymTargets =
            findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);

        // findCallTargetSymbols may have changed N if mem profile is available
        // for jump tables
        if (!N)
          continue;

        LLVM_DEBUG(printDecision(dbgs(), Targets, N));

        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
          const size_t LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", ICP failed to find target symbol for "
                   << Targets[LastTarget].To.Sym->getName() << "\n";
          continue;
        }

        MethodInfoType MethodInfo;

        if (!IsJumpTable) {
          MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
          TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
          LLVM_DEBUG(dbgs()
                     << "BOLT-INFO: ICP "
                     << (!MethodInfo.first.empty() ? "found" : "did not find")
                     << " vtables for all methods.\n");
        } else if (TargetFetchInst) {
          ++TotalIndexBasedJumps;
          MethodInfo.second.push_back(TargetFetchInst);
        }

        // Generate new promoted call code for this callsite.
        MCPlusBuilder::BlocksVectorTy ICPcode =
            (IsJumpTable && !opts::ICPJumpTablesByTarget)
                ? BC.MIB->jumpTablePromotion(Inst, SymTargets,
                                             MethodInfo.second, BC.Ctx.get())
                : BC.MIB->indirectCallPromotion(
                      Inst, SymTargets, MethodInfo.first, MethodInfo.second,
                      opts::ICPOldCodeSequence, BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", unable to generate promoted call code.\n";
          continue;
        }

        LLVM_DEBUG({
          uint64_t Offset = Targets[0].From.Addr;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const MCSymbol *const &Sym = entry.first;
            const InstructionListType &Insts = entry.second;
            if (Sym)
              dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
            rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);

        // Fix the CFG after inserting the new basic blocks.
        BinaryBasicBlock *MergeBlock =
            fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
        if (MergeBlock)
          BBs.push_back(MergeBlock);

        if (opts::Verbosity >= 1)
          outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
                 << InstIdx << " in " << BB->getName()
                 << " -> calls = " << NumCalls << "\n";

        if (IsJumpTable)
          ++TotalOptimizedJumpTableCallsites;
        else
          ++TotalOptimizedIndirectCallsites;

        Modified.insert(&Function);
      }
    }
    TotalIndirectCalls += FuncTotalIndirectCalls;
    TotalIndirectJmps += FuncTotalIndirectJmps;
  }

  outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
         << TotalIndirectCallsites << "\n"
         << "BOLT-INFO: ICP total jump table callsites = "
         << TotalJumpTableCallsites << "\n"
         << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
         << "BOLT-INFO: ICP percentage of calls that are indirect = "
         << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
         << "BOLT-INFO: ICP percentage of indirect calls that can be "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentCalls) /
                               std::max<size_t>(TotalIndirectCalls, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect callsites that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
                               std::max<uint64_t>(TotalIndirectCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of method load elimination candidates = "
         << TotalMethodLoadEliminationCandidates << "\n"
         << "BOLT-INFO: ICP percentage of method calls candidates that have "
            "loads eliminated = "
         << format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
                               std::max<uint64_t>(
                                   TotalMethodLoadEliminationCandidates, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect branches that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentJmps) /
                               std::max<uint64_t>(TotalIndirectJmps, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that are "
         << "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
                               std::max<uint64_t>(TotalJumpTableCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of jump table callsites that can use hot "
         << "indices = " << TotalIndexBasedCandidates << "\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
            "indices = "
         << format("%.1f", (100.0 * TotalIndexBasedJumps) /
                               std::max<uint64_t>(TotalIndexBasedCandidates, 1))
         << "%\n";

  (void)verifyProfile;
#ifndef NDEBUG
  verifyProfile(BFs);
#endif
}

} // namespace bolt
} // namespace llvm