forked from OSchip/llvm-project
Indirect call promotion optimization.
Summary: Perform indirect call promotion optimization in BOLT. The code scans the instructions during CFG creation for all indirect calls. Right now indirect tail calls are not handled since the functions are marked not simple. The offsets of the indirect calls are stored for later use by the ICP pass. The indirect call promotion pass visits each indirect call and examines the BranchData for each. If the most frequent targets from that callsite exceed the specified threshold (default 90%), the call is promoted. Otherwise, it is ignored. By default, only one target is considered at each callsite. When a candidate callsite is processed, we modify the callsite to test for the most common call targets before calling through the original generic call mechanism. The CFG and layout are modified by ICP. A few new command line options have been added: -indirect-call-promotion -indirect-call-promotion-threshold=<percentage> -indirect-call-promotion-topn=<int> The threshold is the minimum frequency of a call target needed before ICP is triggered. The topn option controls the number of targets to consider for each callsite, e.g. ICP is triggered if topn=2 and the total frequency of the top two call targets exceeds the threshold. Example of ICP: C++ code: int B_count = 0; int C_count = 0; struct A { virtual void foo() = 0; }; struct B : public A { virtual void foo() { ++B_count; }; }; struct C : public A { virtual void foo() { ++C_count; }; }; A* a = ... a->foo(); ... original: 400863: 49 8b 07 mov (%r15),%rax 400866: 4c 89 ff mov %r15,%rdi 400869: ff 10 callq *(%rax) 40086b: 41 83 e6 01 and $0x1,%r14d 40086f: 4d 89 e6 mov %r12,%r14 400872: 4c 0f 44 f5 cmove %rbp,%r14 400876: 4c 89 f7 mov %r14,%rdi ... 
after ICP: 40085e: 49 8b 07 mov (%r15),%rax 400861: 4c 89 ff mov %r15,%rdi 400864: 49 ba e0 0b 40 00 00 movabs $0x400be0,%r10 40086b: 00 00 00 40086e: 4c 3b 10 cmp (%rax),%r10 400871: 75 29 jne 40089c <main+0x9c> 400873: 41 ff d2 callq *%r10 400876: 41 83 e6 01 and $0x1,%r14d 40087a: 4d 89 e6 mov %r12,%r14 40087d: 4c 0f 44 f5 cmove %rbp,%r14 400881: 4c 89 f7 mov %r14,%rdi ... 40089c: ff 10 callq *(%rax) 40089e: eb d6 jmp 400876 <main+0x76> (cherry picked from FBD3612218)
This commit is contained in:
parent
6ff1795d96
commit
d74997c3cc
|
@ -29,6 +29,12 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
|
|||
return LHS.Index < RHS.Index;
|
||||
}
|
||||
|
||||
void BinaryBasicBlock::adjustNumPseudos(const MCInst &Inst, int Sign) {
|
||||
auto &BC = Function->getBinaryContext();
|
||||
if (BC.MII->get(Inst.getOpcode()).isPseudo())
|
||||
NumPseudos += Sign;
|
||||
}
|
||||
|
||||
MCInst *BinaryBasicBlock::getFirstNonPseudo() {
|
||||
auto &BC = Function->getBinaryContext();
|
||||
for (auto &Inst : Instructions) {
|
||||
|
@ -47,6 +53,34 @@ MCInst *BinaryBasicBlock::getLastNonPseudo() {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
bool BinaryBasicBlock::validateSuccessorInvariants() {
|
||||
const MCSymbol *TBB = nullptr;
|
||||
const MCSymbol *FBB = nullptr;
|
||||
MCInst *CondBranch = nullptr;
|
||||
MCInst *UncondBranch = nullptr;
|
||||
|
||||
assert(getNumPseudos() == getNumPseudos());
|
||||
|
||||
if (analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
|
||||
switch (Successors.size()) {
|
||||
case 0:
|
||||
return !CondBranch && !UncondBranch;
|
||||
case 1:
|
||||
return !CondBranch;
|
||||
case 2:
|
||||
if (CondBranch) {
|
||||
return (TBB == getConditionalSuccessor(true)->getLabel() &&
|
||||
((!UncondBranch && !FBB) ||
|
||||
(UncondBranch && FBB == getConditionalSuccessor(false)->getLabel())));
|
||||
}
|
||||
return true;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const {
|
||||
if (!Label && succ_size() == 1)
|
||||
return *succ_begin();
|
||||
|
@ -121,14 +155,16 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) {
|
|||
}
|
||||
|
||||
void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) {
|
||||
LandingPads.push_back(LPBlock);
|
||||
if (std::find(LandingPads.begin(), LandingPads.end(), LPBlock) == LandingPads.end()) {
|
||||
LandingPads.push_back(LPBlock);
|
||||
}
|
||||
LPBlock->Throwers.insert(this);
|
||||
}
|
||||
|
||||
void BinaryBasicBlock::clearLandingPads() {
|
||||
for (auto *LPBlock : LandingPads) {
|
||||
auto count = LPBlock->Throwers.erase(this);
|
||||
assert(count == 1);
|
||||
assert(count == 1 && "Possible duplicate entry in LandingPads");
|
||||
}
|
||||
LandingPads.clear();
|
||||
}
|
||||
|
|
|
@ -376,6 +376,7 @@ public:
|
|||
/// Add instruction at the end of this basic block.
|
||||
/// Returns the index of the instruction in the Instructions vector of the BB.
|
||||
uint32_t addInstruction(MCInst &&Inst) {
|
||||
adjustNumPseudos(Inst, 1);
|
||||
Instructions.emplace_back(Inst);
|
||||
return Instructions.size() - 1;
|
||||
}
|
||||
|
@ -383,6 +384,7 @@ public:
|
|||
/// Add instruction at the end of this basic block.
|
||||
/// Returns the index of the instruction in the Instructions vector of the BB.
|
||||
uint32_t addInstruction(const MCInst &Inst) {
|
||||
adjustNumPseudos(Inst, 1);
|
||||
Instructions.push_back(Inst);
|
||||
return Instructions.size() - 1;
|
||||
}
|
||||
|
@ -435,6 +437,10 @@ public:
|
|||
uint64_t Count = 0,
|
||||
uint64_t MispredictedCount = 0);
|
||||
|
||||
void addSuccessor(BinaryBasicBlock *Succ, const BinaryBranchInfo &BI) {
|
||||
addSuccessor(Succ, BI.Count, BI.MispredictedCount);
|
||||
}
|
||||
|
||||
/// Add a range of successors.
|
||||
template <typename Itr>
|
||||
void addSuccessors(Itr Begin, Itr End) {
|
||||
|
@ -448,8 +454,7 @@ public:
|
|||
void addSuccessors(Itr Begin, Itr End, BrItr BrBegin, BrItr BrEnd) {
|
||||
assert(std::distance(Begin, End) == std::distance(BrBegin, BrEnd));
|
||||
while (Begin != End) {
|
||||
const auto BrInfo = *BrBegin++;
|
||||
addSuccessor(*Begin++, BrInfo.Count, BrInfo.MispredictedCount);
|
||||
addSuccessor(*Begin++, *BrBegin++);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -551,20 +556,22 @@ public:
|
|||
/// Replace an instruction with a sequence of instructions. Returns true
|
||||
/// if the instruction to be replaced was found and replaced.
|
||||
template <typename Itr>
|
||||
bool replaceInstruction(MCInst *Inst, Itr Begin, Itr End) {
|
||||
bool replaceInstruction(const MCInst *Inst, Itr Begin, Itr End) {
|
||||
auto I = Instructions.end();
|
||||
auto B = Instructions.begin();
|
||||
while (I > B) {
|
||||
--I;
|
||||
if (&*I == Inst) {
|
||||
adjustNumPseudos(*Inst, -1);
|
||||
Instructions.insert(Instructions.erase(I), Begin, End);
|
||||
adjustNumPseudos(Begin, End, 1);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool replaceInstruction(MCInst *Inst,
|
||||
bool replaceInstruction(const MCInst *Inst,
|
||||
const std::vector<MCInst> &Replacement) {
|
||||
return replaceInstruction(Inst, Replacement.begin(), Replacement.end());
|
||||
}
|
||||
|
@ -580,7 +587,8 @@ public:
|
|||
Instructions.pop_back();
|
||||
}
|
||||
std::reverse(SplitInst.begin(), SplitInst.end());
|
||||
|
||||
NumPseudos = 0;
|
||||
adjustNumPseudos(Instructions.begin(), Instructions.end(), 1);
|
||||
return SplitInst;
|
||||
}
|
||||
|
||||
|
@ -626,7 +634,18 @@ public:
|
|||
/// A simple dump function for debugging.
|
||||
void dump() const;
|
||||
|
||||
/// Validate successor invariants for this BB.
|
||||
bool validateSuccessorInvariants();
|
||||
|
||||
private:
|
||||
void adjustNumPseudos(const MCInst &Inst, int Sign);
|
||||
|
||||
template <typename Itr>
|
||||
void adjustNumPseudos(Itr Begin, Itr End, int Sign) {
|
||||
while (Begin != End) {
|
||||
adjustNumPseudos(*Begin++, Sign);
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds predecessor to the BB. Most likely you don't need to call this.
|
||||
void addPredecessor(BinaryBasicBlock *Pred);
|
||||
|
|
|
@ -370,6 +370,14 @@ void BinaryContext::printInstruction(raw_ostream &OS,
|
|||
}
|
||||
}
|
||||
|
||||
MIA->forEachAnnotation(
|
||||
Instruction,
|
||||
[&OS](const MCAnnotation *Annotation) {
|
||||
OS << " # " << Annotation->getName() << ": ";
|
||||
Annotation->print(OS);
|
||||
}
|
||||
);
|
||||
|
||||
const DWARFDebugLine::LineTable *LineTable =
|
||||
Function && opts::PrintDebugInfo ? Function->getDWARFUnitLineTable().second
|
||||
: nullptr;
|
||||
|
|
|
@ -910,6 +910,7 @@ void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
|
|||
|
||||
auto &Ctx = BC.Ctx;
|
||||
auto &MIA = BC.MIA;
|
||||
auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames());
|
||||
|
||||
DWARFUnitLineTable ULT = getDWARFUnitLineTable();
|
||||
|
||||
|
@ -1122,12 +1123,6 @@ void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
|
|||
}
|
||||
}
|
||||
|
||||
Instruction.clear();
|
||||
Instruction.addOperand(
|
||||
MCOperand::createExpr(
|
||||
MCSymbolRefExpr::create(TargetSymbol,
|
||||
MCSymbolRefExpr::VK_None,
|
||||
*Ctx)));
|
||||
if (!IsCall) {
|
||||
// Add taken branch info.
|
||||
TakenBranches.emplace_back(Offset, TargetAddress - getAddress());
|
||||
|
@ -1136,6 +1131,21 @@ void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
|
|||
// Add fallthrough branch info.
|
||||
FTBranches.emplace_back(Offset, Offset + Size);
|
||||
}
|
||||
|
||||
const bool isIndirect =
|
||||
((IsCall || !IsCondBranch) && MIA->isIndirectBranch(Instruction));
|
||||
|
||||
Instruction.clear();
|
||||
Instruction.addOperand(
|
||||
MCOperand::createExpr(
|
||||
MCSymbolRefExpr::create(TargetSymbol,
|
||||
MCSymbolRefExpr::VK_None,
|
||||
*Ctx)));
|
||||
|
||||
if (isIndirect && BranchDataOrErr) {
|
||||
MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
} else {
|
||||
// Could not evaluate branch. Should be an indirect call or an
|
||||
// indirect branch. Bail out on the latter case.
|
||||
|
@ -1145,7 +1155,14 @@ void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
|
|||
default:
|
||||
llvm_unreachable("unexpected result");
|
||||
case IndirectBranchType::POSSIBLE_TAIL_CALL:
|
||||
MIA->convertJmpToTailCall(Instruction);
|
||||
{
|
||||
auto Result = MIA->convertJmpToTailCall(Instruction);
|
||||
assert(Result);
|
||||
if (BranchDataOrErr) {
|
||||
MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case IndirectBranchType::POSSIBLE_JUMP_TABLE:
|
||||
case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE:
|
||||
|
@ -1155,8 +1172,19 @@ void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
|
|||
case IndirectBranchType::UNKNOWN:
|
||||
// Keep processing. We'll do more checks and fixes in
|
||||
// postProcessIndirectBranches().
|
||||
if (BranchDataOrErr) {
|
||||
MIA->addAnnotation(Ctx.get(),
|
||||
Instruction,
|
||||
"MaybeIndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
break;
|
||||
};
|
||||
} else if (MIA->isCall(Instruction)) {
|
||||
if (BranchDataOrErr) {
|
||||
MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
}
|
||||
// Indirect call. We only need to fix it if the operand is RIP-relative
|
||||
if (IsSimple && MIA->hasRIPOperand(Instruction)) {
|
||||
|
@ -1248,6 +1276,8 @@ void BinaryFunction::postProcessJumpTables() {
|
|||
}
|
||||
|
||||
bool BinaryFunction::postProcessIndirectBranches() {
|
||||
auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames());
|
||||
|
||||
for (auto *BB : layout()) {
|
||||
for (auto &Instr : *BB) {
|
||||
if (!BC.MIA->isIndirectBranch(Instr))
|
||||
|
@ -1257,6 +1287,15 @@ bool BinaryFunction::postProcessIndirectBranches() {
|
|||
// it must be a tail call.
|
||||
if (layout_size() == 1) {
|
||||
BC.MIA->convertJmpToTailCall(Instr);
|
||||
|
||||
if (BC.MIA->hasAnnotation(Instr, "MaybeIndirectBranchData")) {
|
||||
auto Offset =
|
||||
BC.MIA->getAnnotationAs<uint64_t>(Instr, "MaybeIndirectBranchData");
|
||||
BC.MIA->addAnnotation(BC.Ctx.get(),
|
||||
Instr,
|
||||
"IndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1336,6 +1375,15 @@ bool BinaryFunction::postProcessIndirectBranches() {
|
|||
return false;
|
||||
}
|
||||
BC.MIA->convertJmpToTailCall(Instr);
|
||||
|
||||
if (BranchDataOrErr) {
|
||||
auto Offset =
|
||||
BC.MIA->getAnnotationAs<uint64_t>(Instr, "MaybeIndirectBranchData");
|
||||
BC.MIA->addAnnotation(BC.Ctx.get(),
|
||||
Instr,
|
||||
"IndirectBranchData",
|
||||
Offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -2097,8 +2145,8 @@ void BinaryFunction::removeConditionalTailCalls() {
|
|||
// We have to add 1 byte as there's potentially an existing branch past
|
||||
// the end of the code as a result of __builtin_unreachable().
|
||||
const BinaryBasicBlock *LastBB = BasicBlocks.back();
|
||||
uint64_t NewBlockOffset = LastBB->getOffset() +
|
||||
BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1;
|
||||
uint64_t NewBlockOffset =
|
||||
LastBB->getOffset() + BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1;
|
||||
TailCallBB = addBasicBlock(NewBlockOffset, TCLabel);
|
||||
TailCallBB->addInstruction(TailCallInst);
|
||||
|
||||
|
@ -2184,6 +2232,7 @@ BinaryFunction::annotateCFIState(const MCInst *Stop) {
|
|||
} else if (CFI->getOperation() != MCCFIInstruction::OpGnuArgsSize) {
|
||||
State = HighestState;
|
||||
}
|
||||
assert(State <= FrameInstructions.size());
|
||||
++Idx;
|
||||
if (&Instr == Stop) {
|
||||
CFIState.emplace_back(State);
|
||||
|
@ -2315,9 +2364,9 @@ bool BinaryFunction::fixCFIState() {
|
|||
|
||||
if (StackOffset != 0) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
errs() << " BOLT-WARNING: not possible to remember/recover state"
|
||||
errs() << "BOLT-WARNING: not possible to remember/recover state"
|
||||
<< " without corrupting CFI state stack in function "
|
||||
<< *this << "\n";
|
||||
<< *this << " @ " << BB->getName() << "\n";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -2616,12 +2665,18 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const {
|
|||
BasicBlocksLayout.end(),
|
||||
BB);
|
||||
unsigned Layout = LayoutPos - BasicBlocksLayout.begin();
|
||||
OS << format("\"%s\" [label=\"%s\\n(O:%lu,I:%u,L%u)\"]\n",
|
||||
const char* ColdStr = BB->isCold() ? " (cold)" : "";
|
||||
OS << format("\"%s\" [label=\"%s%s\\n(C:%lu,O:%lu,I:%u,L:%u:CFI:%u)\"]\n",
|
||||
BB->getName().data(),
|
||||
BB->getName().data(),
|
||||
ColdStr,
|
||||
(BB->ExecutionCount != BinaryBasicBlock::COUNT_NO_PROFILE
|
||||
? BB->ExecutionCount
|
||||
: 0),
|
||||
BB->getOffset(),
|
||||
getIndex(BB),
|
||||
Layout);
|
||||
Layout,
|
||||
BBCFIState[getIndex(BB)]);
|
||||
OS << format("\"%s\" [shape=box]\n", BB->getName().data());
|
||||
if (opts::DotToolTipCode) {
|
||||
std::string Str;
|
||||
|
@ -2673,7 +2728,7 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const {
|
|||
|
||||
if (BB->getExecutionCount() != COUNT_NO_PROFILE &&
|
||||
BI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) {
|
||||
OS << "\\n(M:" << BI->MispredictedCount << ",C:" << BI->Count << ")";
|
||||
OS << "\\n(C:" << BI->Count << ",M:" << BI->MispredictedCount << ")";
|
||||
} else if (ExecutionCount != COUNT_NO_PROFILE &&
|
||||
BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
|
||||
OS << "\\n(IC:" << BI->Count << ")";
|
||||
|
@ -2727,6 +2782,41 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const {
|
|||
dumpGraph(of);
|
||||
}
|
||||
|
||||
bool BinaryFunction::validateCFG() {
|
||||
bool Valid = true;
|
||||
for (auto *BB : BasicBlocks) {
|
||||
Valid &= BB->validateSuccessorInvariants();
|
||||
if (!Valid) {
|
||||
errs() << "BOLT-WARNING: CFG invalid @ " << BB->getName() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
if (!Valid)
|
||||
return Valid;
|
||||
|
||||
for (auto *BB : BasicBlocks) {
|
||||
std::set<BinaryBasicBlock *> Seen;
|
||||
for (auto *LPBlock : BB->LandingPads) {
|
||||
Valid &= Seen.count(LPBlock) == 0;
|
||||
if (!Valid) {
|
||||
errs() << "Duplicate LP seen " << LPBlock->getName() << "\n";
|
||||
break;
|
||||
}
|
||||
Seen.insert(LPBlock);
|
||||
auto count = LPBlock->Throwers.count(BB);
|
||||
Valid &= (count == 1);
|
||||
if (!Valid) {
|
||||
errs() << "Inconsistent landing pad detected " << LPBlock->getName()
|
||||
<< " is in LandingPads but not in " << BB->getName()
|
||||
<< "->Throwers\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Valid;
|
||||
}
|
||||
|
||||
void BinaryFunction::fixBranches() {
|
||||
auto &MIA = BC.MIA;
|
||||
auto *Ctx = BC.Ctx.get();
|
||||
|
@ -2778,6 +2868,7 @@ void BinaryFunction::fixBranches() {
|
|||
// terminator) or more than 2 (switch table) don't require branch
|
||||
// instruction adjustments.
|
||||
}
|
||||
assert(validateCFG());
|
||||
}
|
||||
|
||||
void BinaryFunction::splitFunction() {
|
||||
|
@ -3257,6 +3348,7 @@ void BinaryFunction::updateLayout(BinaryBasicBlock* Start,
|
|||
auto Begin = &BasicBlocks[getIndex(Start) + 1];
|
||||
auto End = &BasicBlocks[getIndex(Start) + NumNewBlocks + 1];
|
||||
BasicBlocksLayout.insert(Pos + 1, Begin, End);
|
||||
updateLayoutIndices();
|
||||
}
|
||||
|
||||
void BinaryFunction::updateLayout(LayoutType Type,
|
||||
|
@ -3265,6 +3357,7 @@ void BinaryFunction::updateLayout(LayoutType Type,
|
|||
// Recompute layout with original parameters.
|
||||
BasicBlocksLayout = BasicBlocks;
|
||||
modifyLayout(Type, MinBranchClusters, Split);
|
||||
updateLayoutIndices();
|
||||
}
|
||||
|
||||
bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
|
||||
|
|
|
@ -80,7 +80,7 @@ class DynoStats {
|
|||
Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
|
||||
D(ALL_CONDITIONAL, "all conditional branches",\
|
||||
Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
|
||||
D(LAST_DYNO_STAT, "<reserved>", Fn)
|
||||
D(LAST_DYNO_STAT, "<reserved>", 0)
|
||||
|
||||
public:
|
||||
#define D(name, ...) name,
|
||||
|
@ -839,6 +839,9 @@ public:
|
|||
return BC;
|
||||
}
|
||||
|
||||
/// Attempt to validate CFG invariants.
|
||||
bool validateCFG();
|
||||
|
||||
/// Return dynostats for the function.
|
||||
///
|
||||
/// The function relies on branch instructions being in-sync with CFG for
|
||||
|
@ -1161,9 +1164,8 @@ public:
|
|||
|
||||
/// Insert the BBs contained in NewBBs into the basic blocks for this
|
||||
/// function. Update the associated state of all blocks as needed, i.e.
|
||||
/// BB offsets, BB indices, and optionally CFI state. The new BBs are
|
||||
/// inserted after Start. This operation could affect fallthrough branches
|
||||
/// for Start.
|
||||
/// BB offsets and BB indices. The new BBs are inserted after Start.
|
||||
/// This operation could affect fallthrough branches for Start.
|
||||
///
|
||||
void insertBasicBlocks(
|
||||
BinaryBasicBlock *Start,
|
||||
|
|
|
@ -17,9 +17,9 @@ using namespace llvm;
|
|||
|
||||
namespace opts {
|
||||
|
||||
extern llvm::cl::opt<bool> PrintAll;
|
||||
extern llvm::cl::opt<bool> DumpDotAll;
|
||||
extern llvm::cl::opt<bool> DynoStatsAll;
|
||||
extern cl::opt<bool> PrintAll;
|
||||
extern cl::opt<bool> DumpDotAll;
|
||||
extern cl::opt<bool> DynoStatsAll;
|
||||
|
||||
llvm::cl::opt<bool> TimeOpts("time-opts",
|
||||
cl::desc("print time spent in each optimization"),
|
||||
|
@ -37,6 +37,11 @@ OptimizeBodylessFunctions(
|
|||
cl::desc("optimize functions that just do a tail call"),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::opt<bool>
|
||||
IndirectCallPromotion("indirect-call-promotion",
|
||||
cl::desc("indirect call promotion"),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::opt<bool>
|
||||
InlineSmallFunctions(
|
||||
"inline-small-functions",
|
||||
|
@ -118,6 +123,12 @@ PrintICF("print-icf",
|
|||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintICP("print-icp",
|
||||
cl::desc("print functions after indirect call promotion"),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintInline("print-inline",
|
||||
cl::desc("print functions after inlining optimization"),
|
||||
|
@ -197,7 +208,7 @@ void BinaryFunctionPassManager::runAllPasses(
|
|||
) {
|
||||
BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions);
|
||||
|
||||
// Here we manage dependencies/order manually, since passes are ran in the
|
||||
// Here we manage dependencies/order manually, since passes are run in the
|
||||
// order they're registered.
|
||||
|
||||
// Run this pass first to use stats for the original functions.
|
||||
|
@ -205,6 +216,12 @@ void BinaryFunctionPassManager::runAllPasses(
|
|||
|
||||
Manager.registerPass(llvm::make_unique<IdenticalCodeFolding>(PrintICF));
|
||||
|
||||
Manager.registerPass(llvm::make_unique<IndirectCallPromotion>(PrintICP),
|
||||
opts::IndirectCallPromotion);
|
||||
|
||||
Manager.registerPass(llvm::make_unique<Peepholes>(PrintPeepholes),
|
||||
opts::Peepholes);
|
||||
|
||||
Manager.registerPass(llvm::make_unique<InlineSmallFunctions>(PrintInline),
|
||||
opts::InlineSmallFunctions);
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ private:
|
|||
/// Runs all enabled implemented passes on all functions.
|
||||
static void runAllPasses(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &Functions,
|
||||
std::set<uint64_t> &largeFunctions);
|
||||
std::set<uint64_t> &LargeFunctions);
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -65,6 +65,53 @@ AggressiveInlining("aggressive-inlining",
|
|||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionThreshold(
|
||||
"indirect-call-promotion-threshold",
|
||||
cl::desc("threshold for optimizing a frequently taken indirect call"),
|
||||
cl::init(90),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionMispredictThreshold(
|
||||
"indirect-call-promotion-mispredict-threshold",
|
||||
cl::desc("misprediction threshold for skipping ICP on an "
|
||||
"indirect call"),
|
||||
cl::init(2),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::opt<bool>
|
||||
IndirectCallPromotionUseMispredicts(
|
||||
"indirect-call-promotion-use-mispredicts",
|
||||
cl::desc("use misprediction frequency for determining whether or not ICP "
|
||||
"should be applied at a callsite. The "
|
||||
"-indirect-call-promotion-mispredict-threshold value will be used "
|
||||
"by this heuristic"),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::opt<unsigned>
|
||||
IndirectCallPromotionTopN(
|
||||
"indirect-call-promotion-topn",
|
||||
cl::desc("number of targets to consider when doing indirect "
|
||||
"call promotion"),
|
||||
cl::init(1),
|
||||
cl::ZeroOrMore);
|
||||
|
||||
static cl::list<std::string>
|
||||
ICPFuncsList("icp-funcs",
|
||||
cl::CommaSeparated,
|
||||
cl::desc("list of functions to enable ICP for"),
|
||||
cl::value_desc("func1,func2,func3,..."),
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
ICPOldCodeSequence(
|
||||
"icp-old-code-sequence",
|
||||
cl::desc("use old code sequence for promoted calls"),
|
||||
cl::init(false),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bolt::BinaryFunction::LayoutType>
|
||||
ReorderBlocks(
|
||||
"reorder-blocks",
|
||||
|
@ -274,9 +321,11 @@ void InlineSmallFunctions::findInliningCandidates(
|
|||
continue;
|
||||
auto &BB = *Function.begin();
|
||||
const auto &LastInstruction = *BB.rbegin();
|
||||
// Check if the function is small enough and doesn't do a tail call.
|
||||
// Check if the function is small enough, doesn't do a tail call
|
||||
// and doesn't throw exceptions.
|
||||
if (BB.size() > 0 &&
|
||||
BB.getNumNonPseudos() <= kMaxInstructions &&
|
||||
BB.lp_empty() &&
|
||||
BC.MIA->isReturn(LastInstruction) &&
|
||||
!BC.MIA->isTailCall(LastInstruction)) {
|
||||
InliningCandidates.insert(&Function);
|
||||
|
@ -634,10 +683,7 @@ InlineSmallFunctions::inlineCall(
|
|||
CallerBB->getExecutionCount());
|
||||
}
|
||||
}
|
||||
unsigned NumBlocksToAdd = InlinedInstance.size();
|
||||
CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance));
|
||||
CallerFunction.updateLayout(CallerBB, NumBlocksToAdd);
|
||||
CallerFunction.fixBranches();
|
||||
|
||||
return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex);
|
||||
}
|
||||
|
@ -1020,6 +1066,14 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
|
|||
MCInst *CondBranch = nullptr;
|
||||
MCInst *UncondBranch = nullptr;
|
||||
auto Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
|
||||
|
||||
// analyzeBranch can fail due to unusual branch instructions, e.g. jrcxz
|
||||
if (!Result) {
|
||||
DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
|
||||
PredBB->dump());
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(Result && "internal error analyzing conditional branch");
|
||||
assert(CondBranch && "conditional branch expected");
|
||||
|
||||
|
@ -1116,6 +1170,10 @@ void Peepholes::shortenInstructions(BinaryContext &BC,
|
|||
}
|
||||
}
|
||||
|
||||
void debugDump(BinaryFunction *BF) {
|
||||
BF->dump();
|
||||
}
|
||||
|
||||
// This peephole fixes jump instructions that jump to another basic
|
||||
// block with a single jump instruction, e.g.
|
||||
//
|
||||
|
@ -1140,7 +1198,7 @@ void Peepholes::fixDoubleJumps(BinaryContext &BC,
|
|||
return;
|
||||
|
||||
if (Succ) {
|
||||
Pred->replaceSuccessor(&BB, Succ, BinaryBasicBlock::COUNT_NO_PROFILE);
|
||||
Pred->replaceSuccessor(&BB, Succ);
|
||||
} else {
|
||||
// Succ will be null in the tail call case. In this case we
|
||||
// need to explicitly add a tail call instruction.
|
||||
|
@ -1216,8 +1274,8 @@ void Peepholes::runOnFunctions(BinaryContext &BC,
|
|||
addTailcallTraps(BC, Function);
|
||||
}
|
||||
}
|
||||
outs() << "BOLT-INFO: " << NumDoubleJumps << " double jumps patched.\n";
|
||||
outs() << "BOLT-INFO: " << TailCallTraps << " tail call traps inserted.\n";
|
||||
outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps << " double jumps patched.\n";
|
||||
outs() << "BOLT-INFO: Peephole: " << TailCallTraps << " tail call traps inserted.\n";
|
||||
}
|
||||
|
||||
bool SimplifyRODataLoads::simplifyRODataLoads(
|
||||
|
@ -1550,5 +1608,541 @@ void PrintSortedBy::runOnFunctions(
|
|||
}
|
||||
}
|
||||
|
||||
// Get list of targets for a given call sorted by most frequently
|
||||
// called first.
|
||||
std::vector<BranchInfo> IndirectCallPromotion::getCallTargets(
|
||||
BinaryContext &BC,
|
||||
const FuncBranchData &BranchData,
|
||||
const MCInst &Inst
|
||||
) const {
|
||||
auto Offset = BC.MIA->getAnnotationAs<uint64_t>(Inst, "IndirectBranchData");
|
||||
auto Branches = BranchData.getBranchRange(Offset);
|
||||
std::vector<BranchInfo> Targets(Branches.begin(), Branches.end());
|
||||
|
||||
// Sort by most commonly called targets.
|
||||
std::sort(Targets.begin(), Targets.end(),
|
||||
[](const BranchInfo &A, const BranchInfo &B) {
|
||||
return A.Branches > B.Branches;
|
||||
});
|
||||
|
||||
return Targets;
|
||||
}
|
||||
|
||||
std::vector<std::pair<MCSymbol *, uint64_t>>
|
||||
IndirectCallPromotion::findCallTargetSymbols(
|
||||
BinaryContext &BC,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
const size_t N
|
||||
) const {
|
||||
std::vector<std::pair<MCSymbol *, uint64_t>> SymTargets;
|
||||
|
||||
for (size_t I = 0; I < N; ++I) {
|
||||
MCSymbol* Symbol = nullptr;
|
||||
uint64_t Addr = 0;
|
||||
if (Targets[I].To.IsSymbol) {
|
||||
auto itr = BC.GlobalSymbols.find(Targets[I].To.Name);
|
||||
if (itr == BC.GlobalSymbols.end()) {
|
||||
// punt if we can't find a symbol.
|
||||
break;
|
||||
}
|
||||
Symbol = BC.getOrCreateGlobalSymbol(itr->second, "FUNCat");
|
||||
assert(Symbol);
|
||||
} else {
|
||||
Addr = Targets[I].To.Offset;
|
||||
}
|
||||
SymTargets.push_back(std::make_pair(Symbol, Addr));
|
||||
}
|
||||
|
||||
return SymTargets;
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<BinaryBasicBlock>>
|
||||
IndirectCallPromotion::rewriteCall(BinaryContext &BC,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const MCInst &CallInst,
|
||||
MCInstrAnalysis::ICPdata &&ICPcode) const {
|
||||
// Create new basic blocks with correct code in each one first.
|
||||
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
|
||||
const bool IsTailCall = BC.MIA->isTailCall(CallInst);
|
||||
|
||||
// Move instructions from the tail of the original call block
|
||||
// to the merge block.
|
||||
|
||||
// Remember any pseudo instructions following a tail call. These
|
||||
// must be preserved and moved to the original block.
|
||||
std::vector<MCInst> TailInsts;
|
||||
const auto *TailInst= &CallInst;
|
||||
if (IsTailCall) {
|
||||
while (TailInst + 1 < &(*IndCallBlock->end()) &&
|
||||
BC.MII->get((TailInst + 1)->getOpcode()).isPseudo()) {
|
||||
TailInsts.push_back(*++TailInst);
|
||||
}
|
||||
}
|
||||
|
||||
auto MovedInst = IndCallBlock->splitInstructions(&CallInst);
|
||||
|
||||
IndCallBlock->replaceInstruction(&CallInst, ICPcode.front().second);
|
||||
IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end());
|
||||
|
||||
for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
|
||||
auto &Sym = Itr->first;
|
||||
auto &Insts = Itr->second;
|
||||
assert(Sym);
|
||||
auto TBB = Function.createBasicBlock(0, Sym);
|
||||
for (auto &Inst : Insts) { // sanitize new instructions.
|
||||
if (BC.MIA->isCall(Inst))
|
||||
BC.MIA->removeAnnotation(Inst, "IndirectBranchData");
|
||||
}
|
||||
TBB->addInstructions(Insts.begin(), Insts.end());
|
||||
NewBBs.emplace_back(std::move(TBB));
|
||||
}
|
||||
|
||||
// Move tail of instructions from after the original call to
|
||||
// the merge block.
|
||||
if (!IsTailCall) {
|
||||
NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());
|
||||
}
|
||||
|
||||
return NewBBs;
|
||||
}
|
||||
|
||||
BinaryBasicBlock *IndirectCallPromotion::fixCFG(
|
||||
BinaryContext &BC,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const bool IsTailCall,
|
||||
IndirectCallPromotion::BasicBlocksVector &&NewBBs,
|
||||
const std::vector<BranchInfo> &Targets
|
||||
) const {
|
||||
BinaryBasicBlock *MergeBlock = !IsTailCall ? NewBBs.back().get() : nullptr;
|
||||
assert(NewBBs.size() >= 2);
|
||||
assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty());
|
||||
assert(NewBBs.size() % 2 == 1 || IsTailCall);
|
||||
using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
|
||||
|
||||
if (MergeBlock) {
|
||||
std::vector<BinaryBasicBlock*> OldSucc(IndCallBlock->successors().begin(),
|
||||
IndCallBlock->successors().end());
|
||||
std::vector<BinaryBranchInfo> BranchInfo(IndCallBlock->branch_info_begin(),
|
||||
IndCallBlock->branch_info_end());
|
||||
|
||||
// Remove all successors from block doing the indirect call.
|
||||
IndCallBlock->removeSuccessors(OldSucc.begin(), OldSucc.end());
|
||||
assert(IndCallBlock->succ_empty());
|
||||
|
||||
// Move them to the merge block.
|
||||
MergeBlock->addSuccessors(OldSucc.begin(),
|
||||
OldSucc.end(),
|
||||
BranchInfo.begin(),
|
||||
BranchInfo.end());
|
||||
|
||||
// Update the execution count on the MergeBlock.
|
||||
MergeBlock->setExecutionCount(IndCallBlock->getExecutionCount());
|
||||
}
|
||||
|
||||
// Scale indirect call counts to the execution count of the original
|
||||
// basic block containing the indirect call.
|
||||
uint64_t TotalIndirectBranches = 0;
|
||||
uint64_t TotalIndirectMispreds = 0;
|
||||
for (const auto &BI : Targets) {
|
||||
TotalIndirectBranches += BI.Branches;
|
||||
TotalIndirectMispreds += BI.Mispreds;
|
||||
}
|
||||
|
||||
uint64_t TotalCount = 0;
|
||||
uint64_t TotalMispreds = 0;
|
||||
|
||||
if (Function.hasValidProfile()) {
|
||||
TotalCount = IndCallBlock->getExecutionCount();
|
||||
TotalMispreds =
|
||||
TotalCount * ((double)TotalIndirectMispreds / TotalIndirectBranches);
|
||||
assert(TotalCount != BinaryBasicBlock::COUNT_NO_PROFILE);
|
||||
}
|
||||
|
||||
// New BinaryBranchInfo scaled to the execution count of the original BB.
|
||||
std::vector<BinaryBranchInfo> BBI;
|
||||
for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) {
|
||||
BBI.push_back(
|
||||
BinaryBranchInfo{
|
||||
uint64_t(TotalCount * ((double)Itr->Branches / TotalIndirectBranches)),
|
||||
uint64_t(TotalMispreds * ((double)Itr->Mispreds / TotalIndirectMispreds))
|
||||
}
|
||||
);
|
||||
}
|
||||
auto BI = BBI.begin();
|
||||
auto updateCurrentBranchInfo = [&]{
|
||||
assert(BI < BBI.end());
|
||||
TotalCount -= BI->Count;
|
||||
TotalMispreds -= BI->MispredictedCount;
|
||||
++BI;
|
||||
};
|
||||
|
||||
// Fix up successors and execution counts.
|
||||
updateCurrentBranchInfo();
|
||||
IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // uncond branch
|
||||
IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // conditional branch
|
||||
|
||||
size_t Adj = 1 + (!IsTailCall ? 1 : 0);
|
||||
for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
|
||||
assert(TotalCount <= IndCallBlock->getExecutionCount() ||
|
||||
TotalCount <= uint64_t(TotalIndirectBranches));
|
||||
uint64_t ExecCount = BBI[(I+1)/2].Count;
|
||||
NewBBs[I]->setCanOutline(IndCallBlock->canOutline());
|
||||
NewBBs[I]->setIsCold(IndCallBlock->isCold());
|
||||
if (I % 2 == 0) {
|
||||
if (MergeBlock) {
|
||||
NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond
|
||||
}
|
||||
} else {
|
||||
assert(I + 2 < NewBBs.size());
|
||||
updateCurrentBranchInfo();
|
||||
NewBBs[I]->addSuccessor(NewBBs[I+2].get(), TotalCount); // uncond branch
|
||||
NewBBs[I]->addSuccessor(NewBBs[I+1].get(), BBI[(I+1)/2]); // cond. branch
|
||||
ExecCount += TotalCount;
|
||||
}
|
||||
NewBBs[I]->setExecutionCount(ExecCount);
|
||||
}
|
||||
|
||||
// Arrange for the MergeBlock to be the fallthrough for the first
|
||||
// promoted call block.
|
||||
if (MergeBlock) {
|
||||
MergeBlock->setCanOutline(IndCallBlock->canOutline());
|
||||
MergeBlock->setIsCold(IndCallBlock->isCold());
|
||||
std::unique_ptr<BinaryBasicBlock> MBPtr;
|
||||
std::swap(MBPtr, NewBBs.back());
|
||||
NewBBs.pop_back();
|
||||
NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
|
||||
// TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
|
||||
NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
|
||||
}
|
||||
|
||||
// cold call block
|
||||
// TODO: should be able to outline/cold this block.
|
||||
NewBBs.back()->setExecutionCount(TotalCount);
|
||||
NewBBs.back()->setCanOutline(IndCallBlock->canOutline());
|
||||
NewBBs.back()->setIsCold(IndCallBlock->isCold());
|
||||
|
||||
// update BB and BB layout.
|
||||
Function.insertBasicBlocks(IndCallBlock, std::move(NewBBs));
|
||||
assert(Function.validateCFG());
|
||||
|
||||
return MergeBlock;
|
||||
}
|
||||
|
||||
size_t
|
||||
IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB,
|
||||
const MCInst &Inst,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
uint64_t NumCalls) {
|
||||
// If we have no targets (or no calls), skip this callsite.
|
||||
if (Targets.empty() || !NumCalls) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
|
||||
<< InstIdx << " in " << BB->getName()
|
||||
<< ", calls = " << NumCalls
|
||||
<< ", targets empty or NumCalls == 0.\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const auto TrialN = std::min(size_t(opts::IndirectCallPromotionTopN),
|
||||
Targets.size());
|
||||
|
||||
if (!opts::ICPFuncsList.empty()) {
|
||||
for (auto &Name : opts::ICPFuncsList) {
|
||||
if (BB->getFunction()->hasName(Name))
|
||||
return TrialN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Pick the top N targets.
|
||||
uint64_t TotalCallsTopN = 0;
|
||||
uint64_t TotalMispredictsTopN = 0;
|
||||
size_t N = 0;
|
||||
|
||||
if (opts::IndirectCallPromotionUseMispredicts) {
|
||||
// Count total number of mispredictions for (at most) the top N targets.
|
||||
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
|
||||
// is exceeded by fewer targets.
|
||||
double Threshold = double(opts::IndirectCallPromotionMispredictThreshold);
|
||||
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
|
||||
const auto Frequency = (100.0 * Targets[I].Mispreds) / NumCalls;
|
||||
TotalMispredictsTopN += Targets[I].Mispreds;
|
||||
TotalNumFrequentCalls += Targets[I].Branches;
|
||||
Threshold -= Frequency;
|
||||
}
|
||||
|
||||
// Compute the misprediction frequency of the top N call targets. If this
|
||||
// frequency is greater than the threshold, we should try ICP on this callsite.
|
||||
const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
|
||||
|
||||
if (TopNFrequency == 0 ||
|
||||
TopNFrequency < opts::IndirectCallPromotionMispredictThreshold) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
|
||||
<< InstIdx << " in " << BB->getName() << ", calls = "
|
||||
<< NumCalls << ", top N mis. frequency "
|
||||
<< format("%.1f", TopNFrequency) << "% < "
|
||||
<< opts::IndirectCallPromotionMispredictThreshold << "%\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
// Count total number of calls for (at most) the top N targets.
|
||||
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
|
||||
// is exceeded by fewer targets.
|
||||
double Threshold = double(opts::IndirectCallPromotionThreshold);
|
||||
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
|
||||
const auto Frequency = (100.0 * Targets[I].Branches) / NumCalls;
|
||||
TotalCallsTopN += Targets[I].Branches;
|
||||
TotalMispredictsTopN += Targets[I].Mispreds;
|
||||
TotalNumFrequentCalls += Targets[I].Branches;
|
||||
Threshold -= Frequency;
|
||||
}
|
||||
|
||||
// Compute the frequency of the top N call targets. If this frequency
|
||||
// is greater than the threshold, we should try ICP on this callsite.
|
||||
const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls;
|
||||
|
||||
if (TopNFrequency == 0 ||
|
||||
TopNFrequency < opts::IndirectCallPromotionThreshold) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
|
||||
<< InstIdx << " in " << BB->getName() << ", calls = "
|
||||
<< NumCalls << ", top N frequency "
|
||||
<< format("%.1f", TopNFrequency) << "% < "
|
||||
<< opts::IndirectCallPromotionThreshold << "%\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Compute the misprediction frequency of the top N call targets. If
|
||||
// this frequency is less than the threshold, we should skip ICP at
|
||||
// this callsite.
|
||||
const double TopNMispredictFrequency =
|
||||
(100.0 * TotalMispredictsTopN) / NumCalls;
|
||||
|
||||
if (TopNMispredictFrequency <
|
||||
opts::IndirectCallPromotionMispredictThreshold) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ "
|
||||
<< InstIdx << " in " << BB->getName() << ", calls = "
|
||||
<< NumCalls << ", top N mispredict frequency "
|
||||
<< format("%.1f", TopNMispredictFrequency) << "% < "
|
||||
<< opts::IndirectCallPromotionMispredictThreshold << "%\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
void
|
||||
IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
|
||||
const MCInst &Inst,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
const size_t N,
|
||||
uint64_t NumCalls) const {
|
||||
auto &BC = BB->getFunction()->getBinaryContext();
|
||||
const auto InstIdx = &Inst - &(*BB->begin());
|
||||
bool Separator = false;
|
||||
|
||||
outs() << "BOLT-INFO: ICP candidate branch info: "
|
||||
<< *BB->getFunction() << " @ " << InstIdx
|
||||
<< " in " << BB->getName()
|
||||
<< " -> calls = " << NumCalls
|
||||
<< (BC.MIA->isTailCall(Inst) ? " (tail)" : "");
|
||||
for (size_t I = 0; I < N; I++) {
|
||||
const auto Frequency = 100.0 * Targets[I].Branches / NumCalls;
|
||||
const auto MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
|
||||
outs() << (Separator ? " | " : ", ");
|
||||
Separator = true;
|
||||
outs() << Targets[I].To.Name
|
||||
<< ", calls = " << Targets[I].Branches
|
||||
<< ", mispreds = " << Targets[I].Mispreds
|
||||
<< ", taken freq = " << format("%.1f", Frequency) << "%"
|
||||
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
|
||||
}
|
||||
outs() << "\n";
|
||||
|
||||
DEBUG({
|
||||
dbgs() << "BOLT-INFO: ICP original call instruction:\n";
|
||||
BC.printInstruction(dbgs(), Inst, Targets[0].From.Offset, nullptr, true);
|
||||
});
|
||||
}
|
||||
|
||||
/// Run indirect call promotion over every eligible function in \p BFs.
///
/// For each simple, selected function that has branch profile data, every
/// instruction annotated with "IndirectBranchData" is examined. When
/// canPromoteCallsite() accepts the callsite and all chosen targets can be
/// resolved, the call is rewritten into compare-and-direct-call code and the
/// CFG is fixed up. Summary statistics are printed to outs() at the end.
///
/// \p LargeFunctions is not used by this pass.
void IndirectCallPromotion::runOnFunctions(
  BinaryContext &BC,
  std::map<uint64_t, BinaryFunction> &BFs,
  std::set<uint64_t> &LargeFunctions
) {
  for (auto &BFIt : BFs) {
    auto &Function = BFIt.second;

    if (!Function.isSimple() || !opts::shouldProcess(Function))
      continue;

    const auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getNames());
    if (BranchDataOrErr.getError()) {
      DEBUG(dbgs() << "BOLT-INFO: no branch data found for \""
                   << Function << "\"\n");
      continue;
    }
    const FuncBranchData &BranchData = BranchDataOrErr.get();
    const bool HasLayout = !Function.layout_empty();

    // Note: this is not just counting calls.
    TotalCalls += BranchData.ExecutionCount;

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;

    // Worklist of blocks to scan. Merge blocks created by promotion are
    // appended as we go, since they receive the tail of the split block.
    std::vector<BinaryBasicBlock *> BBs;
    for (auto &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold()) {
        BBs.push_back(&BB);
      }
    }

    while (!BBs.empty()) {
      auto *BB = BBs.back();
      BBs.pop_back();

      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        auto &Inst = BB->getInstructionAtIndex(Idx);
        const auto InstIdx = &Inst - &(*BB->begin());

        if (!BC.MIA->hasAnnotation(Inst, "IndirectBranchData"))
          continue;

        assert(BC.MIA->isCall(Inst));

        ++TotalIndirectCallsites;

        const auto Targets = getCallTargets(BC, BranchData, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const auto &BInfo : Targets) {
          NumCalls += BInfo.Branches;
        }
        FuncTotalIndirectCalls += NumCalls;

        // Should this callsite be optimized?  Return the number of targets
        // to use when promoting this call.  A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls);

        if (!N)
          continue;

        if (opts::Verbosity >= 1) {
          printCallsiteInfo(BB, Inst, Targets, N, NumCalls);
        }

        // Find MCSymbols or absolute addresses for each call target.
        const auto SymTargets = findCallTargetSymbols(BC, Targets, N);

        // If we can't resolve any of the target symbols, punt on this callsite.
        if (SymTargets.size() < N) {
          const auto LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1) {
            outs() << "BOLT-INFO: ICP failed to find target symbol for "
                   << Targets[LastTarget].To.Name << " in "
                   << Function << " @ " << InstIdx << " in "
                   << BB->getName() << ", calls = " << NumCalls << "\n";
          }
          continue;
        }

        // Generate new promoted call code for this callsite.
        auto ICPcode =
          BC.MIA->indirectCallPromotion(Inst,
                                        SymTargets,
                                        opts::ICPOldCodeSequence,
                                        BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1) {
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", unable to generate promoted call code.\n";
          }
          continue;
        }

        DEBUG({
          auto Offset = Targets[0].From.Offset;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const auto &Sym = entry.first;
            const auto &Insts = entry.second;
            if (Sym) dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(),
                                          Insts.begin(),
                                          Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        // The tail-call property is read before the rewrite touches Inst.
        const bool IsTailCall = BC.MIA->isTailCall(Inst);
        auto NewBBs = rewriteCall(BC, Function, BB, Inst, std::move(ICPcode));

        // Fix the CFG after inserting the new basic blocks.
        auto MergeBlock = fixCFG(BC, Function, BB, IsTailCall,
                                 std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
        if (MergeBlock) {
          BBs.push_back(MergeBlock);
        }

        if (opts::Verbosity >= 1) {
          outs() << "BOLT-INFO: ICP succeeded in "
                 << Function << " @ " << InstIdx
                 << " in " << BB->getName()
                 << " -> calls = " << NumCalls << "\n";
        }

        ++TotalOptimizedIndirectCallsites;

        Modified.insert(&Function);
      }
    }
    TotalIndirectCalls += FuncTotalIndirectCalls;
  }

  // Percentage of Num over Den. A zero denominator (e.g. no profile data or
  // no indirect callsites at all) yields 0 rather than printing NaN/inf.
  auto percent = [](uint64_t Num, uint64_t Den) -> double {
    return Den ? (100.0 * Num) / Den : 0.0;
  };

  outs() << "BOLT-INFO: ICP total indirect callsites = "
         << TotalIndirectCallsites
         << "\n"
         << "BOLT-INFO: ICP total number of calls = "
         << TotalCalls
         << "\n"
         << "BOLT-INFO: ICP percentage of calls that are indirect = "
         << format("%.1f", percent(TotalIndirectCalls, TotalCalls))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect calls that can be optimized = "
         << format("%.1f", percent(TotalNumFrequentCalls, TotalIndirectCalls))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect calls that are optimized = "
         << format("%.1f", percent(TotalOptimizedIndirectCallsites,
                                   TotalIndirectCallsites))
         << "%\n";
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
|
|
@ -395,6 +395,154 @@ class PrintSortedBy : public BinaryFunctionPass {
|
|||
std::set<uint64_t> &LargeFunctions) override;
|
||||
};
|
||||
|
||||
/// Optimize indirect calls.
|
||||
/// The indirect call promotion pass visits each indirect call and
|
||||
/// examines the BranchData for each. If the most frequent targets
|
||||
/// from that callsite exceed the specified threshold (default 90%),
|
||||
/// the call is promoted. Otherwise, it is ignored. By default,
|
||||
/// only one target is considered at each callsite.
|
||||
///
|
||||
/// When a candidate callsite is processed, we modify the callsite
|
||||
/// to test for the most common call targets before calling through
|
||||
/// the original generic call mechanism.
|
||||
///
|
||||
/// The CFG and layout are modified by ICP.
|
||||
///
|
||||
/// A few new command line options have been added:
|
||||
/// -indirect-call-promotion
|
||||
/// -indirect-call-promotion-threshold=<percentage>
|
||||
/// -indirect-call-promotion-mispredict-threshold=<percentage>
|
||||
/// -indirect-call-promotion-topn=<int>
|
||||
///
|
||||
/// The threshold is the minimum frequency of a call target needed
|
||||
/// before ICP is triggered.
|
||||
///
|
||||
/// The mispredict threshold is used to disable the optimization at
|
||||
/// any callsite where the branch predictor does a good enough job
|
||||
/// that ICP wouldn't help regardless of the frequency of the most
|
||||
/// common target.
|
||||
///
|
||||
/// The topn option controls the number of targets to consider for
|
||||
/// each callsite, e.g. ICP is triggered if topn=2 and the total
|
||||
/// frequency of the top two call targets exceeds the threshold.
|
||||
///
|
||||
/// The minimize code size option controls whether or not the hot
|
||||
/// calls are to registers (callq %r10) or to function addresses
|
||||
/// (callq $foo).
|
||||
///
|
||||
/// Example of ICP:
|
||||
///
|
||||
/// C++ code:
|
||||
///
|
||||
/// int B_count = 0;
|
||||
/// int C_count = 0;
|
||||
///
|
||||
/// struct A { virtual void foo() = 0; };
|
||||
/// struct B : public A { virtual void foo() { ++B_count; }; };
|
||||
/// struct C : public A { virtual void foo() { ++C_count; }; };
|
||||
///
|
||||
/// A* a = ...
|
||||
/// a->foo();
|
||||
/// ...
|
||||
///
|
||||
/// original assembly:
|
||||
///
|
||||
/// B0: 49 8b 07 mov (%r15),%rax
|
||||
/// 4c 89 ff mov %r15,%rdi
|
||||
/// ff 10 callq *(%rax)
|
||||
/// 41 83 e6 01 and $0x1,%r14d
|
||||
/// 4d 89 e6 mov %r12,%r14
|
||||
/// 4c 0f 44 f5 cmove %rbp,%r14
|
||||
/// 4c 89 f7 mov %r14,%rdi
|
||||
/// ...
|
||||
///
|
||||
/// after ICP:
|
||||
///
|
||||
/// B0: 49 8b 07 mov (%r15),%rax
|
||||
/// 4c 89 ff mov %r15,%rdi
|
||||
/// 48 81 38 e0 0b 40 00 cmpq $B::foo,(%rax)
|
||||
/// 75 29 jne B3
|
||||
/// B1: e8 45 03 00 00 callq $B::foo
|
||||
/// B2: 41 83 e6 01 and $0x1,%r14d
|
||||
/// 4d 89 e6 mov %r12,%r14
|
||||
/// 4c 0f 44 f5 cmove %rbp,%r14
|
||||
/// 4c 89 f7 mov %r14,%rdi
|
||||
/// ...
|
||||
///
|
||||
/// B3: ff 10 callq *(%rax)
|
||||
/// eb d6 jmp B2
|
||||
///
|
||||
class IndirectCallPromotion : public BinaryFunctionPass {
|
||||
using BasicBlocksVector = std::vector<std::unique_ptr<BinaryBasicBlock>>;
|
||||
std::unordered_set<const BinaryFunction *> Modified;
|
||||
// Total number of calls from all callsites.
|
||||
uint64_t TotalCalls{0};
|
||||
|
||||
// Total number of indirect calls from all callsites.
|
||||
// (a fraction of TotalCalls)
|
||||
uint64_t TotalIndirectCalls{0};
|
||||
|
||||
// Total number of callsites that use indirect calls.
|
||||
// (the total number of callsites is not recorded)
|
||||
uint64_t TotalIndirectCallsites{0};
|
||||
|
||||
// Total number of indirect callsites that are optimized by ICP.
|
||||
// (a fraction of TotalIndirectCallsites)
|
||||
uint64_t TotalOptimizedIndirectCallsites{0};
|
||||
|
||||
// Total number of indirect calls that are optimized by ICP.
|
||||
// (a fraction of TotalCalls)
|
||||
uint64_t TotalNumFrequentCalls{0};
|
||||
|
||||
std::vector<BranchInfo> getCallTargets(BinaryContext &BC,
|
||||
const FuncBranchData &BranchData,
|
||||
const MCInst &Inst) const;
|
||||
|
||||
size_t canPromoteCallsite(const BinaryBasicBlock *BB,
|
||||
const MCInst &Inst,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
uint64_t NumCalls);
|
||||
|
||||
void printCallsiteInfo(const BinaryBasicBlock *BB,
|
||||
const MCInst &Inst,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
const size_t N,
|
||||
uint64_t NumCalls) const;
|
||||
|
||||
std::vector<std::pair<MCSymbol *, uint64_t>>
|
||||
findCallTargetSymbols(BinaryContext &BC,
|
||||
const std::vector<BranchInfo> &Targets,
|
||||
const size_t N) const;
|
||||
|
||||
std::vector<std::unique_ptr<BinaryBasicBlock>>
|
||||
rewriteCall(BinaryContext &BC,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const MCInst &CallInst,
|
||||
MCInstrAnalysis::ICPdata &&ICPcode) const;
|
||||
|
||||
BinaryBasicBlock *fixCFG(BinaryContext &BC,
|
||||
BinaryFunction &Function,
|
||||
BinaryBasicBlock *IndCallBlock,
|
||||
const bool IsTailCall,
|
||||
BasicBlocksVector &&NewBBs,
|
||||
const std::vector<BranchInfo> &Targets) const;
|
||||
|
||||
public:
|
||||
explicit IndirectCallPromotion(const cl::opt<bool> &PrintPass)
|
||||
: BinaryFunctionPass(PrintPass) { }
|
||||
|
||||
const char *getName() const {
|
||||
return "indirect-call-promotion";
|
||||
}
|
||||
bool shouldPrint(const BinaryFunction &BF) const override {
|
||||
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
|
||||
}
|
||||
void runOnFunctions(BinaryContext &BC,
|
||||
std::map<uint64_t, BinaryFunction> &BFs,
|
||||
std::set<uint64_t> &LargeFunctions) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
|
|
|
@ -19,6 +19,21 @@
|
|||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
/// Return the range of entries in Data whose source offset equals \p From.
/// Data must be sorted (checked by assertion); the range is empty when no
/// branch originates at \p From.
iterator_range<FuncBranchData::ContainerTy::const_iterator>
FuncBranchData::getBranchRange(uint64_t From) const {
  assert(std::is_sorted(Data.begin(), Data.end()));

  // First entry whose source offset is not below From.
  const auto First = std::lower_bound(
      Data.begin(), Data.end(), From,
      [](const BranchInfo &BI, const uint64_t Val) {
        return BI.From.Offset < Val;
      });

  // First entry past First whose source offset is above From.
  const auto Last = std::upper_bound(
      First, Data.end(), From,
      [](const uint64_t Val, const BranchInfo &BI) {
        return Val < BI.From.Offset;
      });

  return iterator_range<ContainerTy::const_iterator>(First, Last);
}
|
||||
|
||||
void BranchInfo::mergeWith(const BranchInfo &BI) {
|
||||
|
||||
// Merge branch and misprediction counts.
|
||||
|
|
|
@ -122,6 +122,10 @@ struct FuncBranchData {
|
|||
/// returned. If the offset corresponds to an indirect call the behavior is
|
||||
/// undefined.
|
||||
ErrorOr<const BranchInfo &> getDirectCallBranch(uint64_t From) const;
|
||||
|
||||
/// Find all the branches originating at From.
|
||||
iterator_range<ContainerTy::const_iterator> getBranchRange(
|
||||
uint64_t From) const;
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -126,6 +126,14 @@ DynoStatsAll("dyno-stats-all", cl::desc("print dyno stats after each stage"),
|
|||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
/// Hidden option capping how many functions the "top called functions"
/// report prints; defaults to 100.
static cl::opt<unsigned>
|
||||
TopCalledLimit("top-called-limit",
|
||||
cl::desc("maximum number of functions to print in top called "
|
||||
"functions section"),
|
||||
cl::init(100),
|
||||
cl::ZeroOrMore,
|
||||
cl::Hidden);
|
||||
|
||||
cl::opt<bool>
|
||||
HotText("hot-text",
|
||||
cl::desc("hot text symbols support"),
|
||||
|
@ -724,11 +732,9 @@ void RewriteInstance::run() {
|
|||
auto FunctionIt = BinaryFunctions.find(Address);
|
||||
assert(FunctionIt != BinaryFunctions.end() &&
|
||||
"Invalid large function address.");
|
||||
if (opts::Verbosity >= 1) {
|
||||
errs() << "BOLT-WARNING: Function " << FunctionIt->second
|
||||
<< " is larger than its orginal size: emitting again marking it "
|
||||
<< "as not simple.\n";
|
||||
}
|
||||
errs() << "BOLT-WARNING: Function " << FunctionIt->second
|
||||
<< " is larger than its orginal size: emitting again marking it "
|
||||
<< "as not simple.\n";
|
||||
FunctionIt->second.setSimple(false);
|
||||
}
|
||||
|
||||
|
@ -1694,7 +1700,8 @@ void RewriteInstance::disassembleFunctions() {
|
|||
}
|
||||
);
|
||||
auto SFI = ProfiledFunctions.begin();
|
||||
for (int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) {
|
||||
auto SFIend = ProfiledFunctions.end();
|
||||
for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) {
|
||||
outs() << " " << **SFI << " : "
|
||||
<< (*SFI)->getExecutionCount() << '\n';
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue