[BOLT] Move BOLT passes under Passes subdirectory (NFC).

Summary:
Move passes under Passes subdirectory.

Move inlining passes under Passes/Inliner.*

(cherry picked from FBD4575832)
Maksim Panchenko 2017-02-16 14:57:57 -08:00
parent f06a1455ea
commit 88244a10bb
13 changed files with 3767 additions and 11 deletions


@@ -12,8 +12,8 @@
 #include "BinaryBasicBlock.h"
 #include "BinaryFunction.h"
-#include "ReorderAlgorithm.h"
 #include "DataReader.h"
+#include "Passes/ReorderAlgorithm.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/MC/MCAsmInfo.h"


@@ -10,7 +10,8 @@
 //===----------------------------------------------------------------------===//
 #include "BinaryPassManager.h"
-#include "FrameOptimizerPass.h"
+#include "Passes/FrameOptimizer.h"
+#include "Passes/Inliner.h"
 #include "llvm/Support/Timer.h"
 using namespace llvm;


@@ -15,7 +15,7 @@
 #define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H
 #include "BinaryFunction.h"
-#include "BinaryPasses.h"
+#include "Passes/BinaryPasses.h"
 #include "llvm/Support/Options.h"
 #include "llvm/Support/CommandLine.h"
 #include <map>


@@ -1,7 +1,9 @@
 add_subdirectory(merge-fdata)
+add_subdirectory(Passes)
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BOLTPasses
   CodeGen
   Core
   DebugInfoDWARF
@@ -18,13 +20,10 @@ add_llvm_tool(llvm-bolt
   BinaryBasicBlock.cpp
   BinaryContext.cpp
   BinaryFunction.cpp
-  BinaryPasses.cpp
   BinaryPassManager.cpp
   DataReader.cpp
   DebugData.cpp
   Exceptions.cpp
-  FrameOptimizerPass.cpp
   RewriteInstance.cpp
-  ReorderAlgorithm.cpp
   DWARFRewriter.cpp
 )

1581 bolt/Passes/BinaryPasses.cpp Normal file

File diff suppressed because it is too large.

490 bolt/Passes/BinaryPasses.h Normal file

@@ -0,0 +1,490 @@
//===--- BinaryPasses.h - Binary-level analysis/optimization passes -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
namespace llvm {
namespace bolt {
/// An optimization/analysis pass that runs on functions.
class BinaryFunctionPass {
const cl::opt<bool> &PrintPass;
protected:
explicit BinaryFunctionPass(const cl::opt<bool> &PrintPass)
: PrintPass(PrintPass) { }
/// Control whether a specific function should be skipped during
/// optimization.
bool shouldOptimize(const BinaryFunction &BF) const;
public:
virtual ~BinaryFunctionPass() = default;
/// The name of this pass
virtual const char *getName() const = 0;
/// Control whether debug info is printed after this pass is completed.
bool printPass() const { return PrintPass; }
/// Control whether debug info is printed for an individual function after
/// this pass is completed (printPass() must have returned true).
virtual bool shouldPrint(const BinaryFunction &BF) const;
/// Execute this pass on the given functions.
virtual void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) = 0;
};
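/// Illustrative sketch (not part of this interface): a new pass only needs a
/// name and a runOnFunctions() implementation. The hypothetical pass below
/// counts the functions accepted by shouldOptimize(); it uses only facilities
/// already present in this file set (the BFs map, errs()).
///
///   class CountEligibleFunctions : public BinaryFunctionPass {
///   public:
///     explicit CountEligibleFunctions(const cl::opt<bool> &PrintPass)
///       : BinaryFunctionPass(PrintPass) {}
///     const char *getName() const override { return "count-eligible"; }
///     void runOnFunctions(BinaryContext &BC,
///                         std::map<uint64_t, BinaryFunction> &BFs,
///                         std::set<uint64_t> &LargeFunctions) override {
///       uint64_t Eligible = 0;
///       for (auto &BFI : BFs)
///         if (shouldOptimize(BFI.second))
///           ++Eligible;
///       errs() << "BOLT: " << Eligible << " of " << BFs.size()
///              << " functions are eligible for optimization.\n";
///     }
///   };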
/// Detects functions that simply do a tail call when they are called and
/// optimizes calls to these functions.
class OptimizeBodylessFunctions : public BinaryFunctionPass {
private:
/// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G,
/// thus calls to F can be optimized to calls to G.
std::unordered_map<const MCSymbol *, const BinaryFunction *>
EquivalentCallTarget;
void analyze(BinaryFunction &BF,
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs);
void optimizeCalls(BinaryFunction &BF,
BinaryContext &BC);
/// Stats for eliminated calls.
uint64_t NumEliminatedCalls{0};
uint64_t NumOptimizedCallSites{0};
public:
explicit OptimizeBodylessFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "optimize-bodyless";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Detect and eliminate unreachable basic blocks. Such blocks may be filled
/// with nops and used for alignment.
class EliminateUnreachableBlocks : public BinaryFunctionPass {
std::unordered_set<const BinaryFunction *> Modified;
unsigned DeletedBlocks{0};
uint64_t DeletedBytes{0};
void runOnFunction(BinaryFunction& Function);
public:
EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "eliminate-unreachable";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext&,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
// Reorder the basic blocks for each function based on hotness.
class ReorderBasicBlocks : public BinaryFunctionPass {
public:
explicit ReorderBasicBlocks(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "reordering";
}
bool shouldPrint(const BinaryFunction &BF) const override;
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Sync local branches with CFG.
class FixupBranches : public BinaryFunctionPass {
public:
explicit FixupBranches(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "fix-branches";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Fix the CFI state and exception handling information after all other
/// passes have completed.
class FixupFunctions : public BinaryFunctionPass {
public:
explicit FixupFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "fixup-functions";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// An optimization to simplify conditional tail calls by removing
/// unnecessary branches.
///
/// This optimization considers both of the following cases:
///
/// foo: ...
/// jcc L1 original
/// ...
/// L1: jmp bar # TAILJMP
///
/// ->
///
/// foo: ...
/// jcc bar iff jcc L1 is expected
/// ...
///
/// L1 is unreachable
///
/// OR
///
/// foo: ...
/// jcc L2
/// L1: jmp dest # TAILJMP
/// L2: ...
///
/// ->
///
/// foo: jncc dest # TAILJMP
/// L2: ...
///
/// L1 is unreachable
///
/// For this particular case, the first basic block ends with
/// a conditional branch and has two successors, one fall-through
/// and one for when the condition is true.
/// The target of the conditional is a basic block with a single
/// unconditional branch (i.e. tail call) to another function.
/// We don't care about the contents of the fall-through block.
/// We assume that the target of the conditional branch is the
/// first successor.
class SimplifyConditionalTailCalls : public BinaryFunctionPass {
uint64_t NumCandidateTailCalls{0};
uint64_t NumTailCallsPatched{0};
uint64_t NumOrigForwardBranches{0};
uint64_t NumOrigBackwardBranches{0};
std::unordered_set<const BinaryFunction *> Modified;
bool shouldRewriteBranch(const BinaryBasicBlock *PredBB,
const MCInst &CondBranch,
const BinaryBasicBlock *BB,
const bool DirectionFlag);
uint64_t fixTailCalls(BinaryContext &BC, BinaryFunction &BF);
public:
explicit SimplifyConditionalTailCalls(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "simplify-conditional-tail-calls";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Perform simple peephole optimizations.
class Peepholes : public BinaryFunctionPass {
uint64_t NumDoubleJumps{0};
uint64_t TailCallTraps{0};
/// Attempt to use the minimum operand width for arithmetic, branch and
/// move instructions.
void shortenInstructions(BinaryContext &BC, BinaryFunction &Function);
/// Replace double jumps with a jump directly to the target, i.e.
/// jmp/jcc L1; L1: jmp L2 -> jmp/jcc L2.
void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function);
/// Add trap instructions immediately after indirect tail calls to prevent
/// the processor from decoding instructions immediately following the
/// tail call.
void addTailcallTraps(BinaryContext &BC, BinaryFunction &Function);
public:
explicit Peepholes(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "peepholes";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// An optimization to simplify loads from read-only sections. The pass converts
/// load instructions with statically computed target address such as:
///
/// mov 0x12f(%rip), %eax
///
/// to their counterparts that use immediate operands instead of memory loads:
///
/// mov $0x4007dc, %eax
///
/// when the target address points somewhere inside a read-only section.
///
class SimplifyRODataLoads : public BinaryFunctionPass {
uint64_t NumLoadsSimplified{0};
uint64_t NumDynamicLoadsSimplified{0};
uint64_t NumLoadsFound{0};
uint64_t NumDynamicLoadsFound{0};
std::unordered_set<const BinaryFunction *> Modified;
bool simplifyRODataLoads(BinaryContext &BC, BinaryFunction &BF);
public:
explicit SimplifyRODataLoads(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "simplify-read-only-loads";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// An optimization that replaces references to identical functions with
/// references to a single one of them.
///
class IdenticalCodeFolding : public BinaryFunctionPass {
public:
explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "identical-code-folding";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
///
/// Prints a list of the top 100 functions sorted by a set of
/// dyno stats categories.
///
class PrintSortedBy : public BinaryFunctionPass {
public:
explicit PrintSortedBy(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "print-sorted-by";
}
bool shouldPrint(const BinaryFunction &) const override {
return false;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Optimize indirect calls.
/// The indirect call promotion pass visits each indirect call and
/// examines the BranchData for each. If the most frequent targets
/// from that callsite exceed the specified threshold (default 90%),
/// the call is promoted. Otherwise, it is ignored. By default,
/// only one target is considered at each callsite.
///
/// When a candidate callsite is processed, we modify the callsite
/// to test for the most common call targets before calling through
/// the original generic call mechanism.
///
/// The CFG and layout are modified by ICP.
///
/// A few new command line options have been added:
/// -indirect-call-promotion
/// -indirect-call-promotion-threshold=<percentage>
/// -indirect-call-promotion-mispredict-threshold=<percentage>
/// -indirect-call-promotion-topn=<int>
///
/// The threshold is the minimum frequency of a call target needed
/// before ICP is triggered.
///
/// The mispredict threshold is used to disable the optimization at
/// any callsite where the branch predictor does a good enough job
/// that ICP wouldn't help regardless of the frequency of the most
/// common target.
///
/// The topn option controls the number of targets to consider for
/// each callsite, e.g. ICP is triggered if topn=2 and the total
/// frequency of the top two call targets exceeds the threshold.
///
/// The minimize code size option controls whether or not the hot
/// calls are to registers (callq %r10) or to function addresses
/// (callq $foo).
///
/// Example of ICP:
///
/// C++ code:
///
/// int B_count = 0;
/// int C_count = 0;
///
/// struct A { virtual void foo() = 0; }
/// struct B : public A { virtual void foo() { ++B_count; }; };
/// struct C : public A { virtual void foo() { ++C_count; }; };
///
/// A* a = ...
/// a->foo();
/// ...
///
/// original assembly:
///
/// B0: 49 8b 07 mov (%r15),%rax
/// 4c 89 ff mov %r15,%rdi
/// ff 10 callq *(%rax)
/// 41 83 e6 01 and $0x1,%r14d
/// 4d 89 e6 mov %r12,%r14
/// 4c 0f 44 f5 cmove %rbp,%r14
/// 4c 89 f7 mov %r14,%rdi
/// ...
///
/// after ICP:
///
/// B0: 49 8b 07 mov (%r15),%rax
/// 4c 89 ff mov %r15,%rdi
/// 48 81 38 e0 0b 40 00 cmpq $B::foo,(%rax)
/// 75 29 jne B3
/// B1: e8 45 03 00 00 callq $B::foo
/// B2: 41 83 e6 01 and $0x1,%r14d
/// 4d 89 e6 mov %r12,%r14
/// 4c 0f 44 f5 cmove %rbp,%r14
/// 4c 89 f7 mov %r14,%rdi
/// ...
///
/// B3: ff 10 callq *(%rax)
/// eb d6 jmp B2
///
class IndirectCallPromotion : public BinaryFunctionPass {
using BasicBlocksVector = std::vector<std::unique_ptr<BinaryBasicBlock>>;
std::unordered_set<const BinaryFunction *> Modified;
// Total number of calls from all callsites.
uint64_t TotalCalls{0};
// Total number of indirect calls from all callsites.
// (a fraction of TotalCalls)
uint64_t TotalIndirectCalls{0};
// Total number of callsites that use indirect calls.
// (the total number of callsites is not recorded)
uint64_t TotalIndirectCallsites{0};
// Total number of indirect callsites that are optimized by ICP.
// (a fraction of TotalIndirectCallsites)
uint64_t TotalOptimizedIndirectCallsites{0};
// Total number of indirect calls that are optimized by ICP.
// (a fraction of TotalCalls)
uint64_t TotalNumFrequentCalls{0};
std::vector<BranchInfo> getCallTargets(BinaryContext &BC,
const FuncBranchData &BranchData,
const MCInst &Inst) const;
size_t canPromoteCallsite(const BinaryBasicBlock *BB,
const MCInst &Inst,
const std::vector<BranchInfo> &Targets,
uint64_t NumCalls);
void printCallsiteInfo(const BinaryBasicBlock *BB,
const MCInst &Inst,
const std::vector<BranchInfo> &Targets,
const size_t N,
uint64_t NumCalls) const;
std::vector<std::pair<MCSymbol *, uint64_t>>
findCallTargetSymbols(BinaryContext &BC,
const std::vector<BranchInfo> &Targets,
const size_t N) const;
std::vector<std::unique_ptr<BinaryBasicBlock>>
rewriteCall(BinaryContext &BC,
BinaryFunction &Function,
BinaryBasicBlock *IndCallBlock,
const MCInst &CallInst,
MCInstrAnalysis::ICPdata &&ICPcode) const;
BinaryBasicBlock *fixCFG(BinaryContext &BC,
BinaryFunction &Function,
BinaryBasicBlock *IndCallBlock,
const bool IsTailCall,
BasicBlocksVector &&NewBBs,
const std::vector<BranchInfo> &Targets) const;
public:
explicit IndirectCallPromotion(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "indirect-call-promotion";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
/// Pass for lowering any instructions that we have raised and that have
/// to be lowered.
class InstructionLowering : public BinaryFunctionPass {
public:
explicit InstructionLowering(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override {
return "inst-lowering";
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm
#endif

8 bolt/Passes/CMakeLists.txt Normal file

@@ -0,0 +1,8 @@
add_llvm_library(LLVMBOLTPasses
BinaryPasses.cpp
FrameOptimizer.cpp
Inliner.cpp
ReorderAlgorithm.cpp
)
include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt )


@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.cpp -------------------------------------------===//
+//===--- Passes/FrameOptimizer.cpp ----------------------------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -9,7 +9,7 @@
 //
 //===----------------------------------------------------------------------===//
-#include "FrameOptimizerPass.h"
+#include "FrameOptimizer.h"
 #include <queue>
 #include <unordered_map>


@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.h ---------------------------------------------===//
+//===--- Passes/FrameOptimizer.h ------------------------------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -9,8 +9,8 @@
 //
 //===----------------------------------------------------------------------===//
-#ifndef FRAMEOPTIMIZERPASS_H
-#define FRAMEOPTIMIZERPASS_H
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
 #include "BinaryPasses.h"

609 bolt/Passes/Inliner.cpp Normal file

@@ -0,0 +1,609 @@
//===--- Passes/Inliner.cpp - Inlining infra for BOLT ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "Inliner.h"
#include "llvm/Support/Options.h"
#define DEBUG_TYPE "bolt-inliner"
using namespace llvm;
namespace opts {
static cl::list<std::string>
ForceInlineFunctions("force-inline",
cl::CommaSeparated,
cl::desc("list of functions to always consider "
"for inlining"),
cl::value_desc("func1,func2,func3,..."),
cl::Hidden);
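// Usage sketch: passing -force-inline=foo,bar,baz on the llvm-bolt command
// line keeps foo, bar and baz on the inlining candidate list even when they
// have no profile data (see mustConsider() below).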
static cl::opt<bool>
AggressiveInlining("aggressive-inlining",
cl::desc("perform aggressive inlining"),
cl::ZeroOrMore,
cl::Hidden);
}
namespace llvm {
namespace bolt {
void InlineSmallFunctions::findInliningCandidates(
BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs) {
for (const auto &BFIt : BFs) {
const auto &Function = BFIt.second;
if (!shouldOptimize(Function) || Function.size() != 1)
continue;
auto &BB = *Function.begin();
const auto &LastInstruction = *BB.rbegin();
// Check if the function is small enough, doesn't do a tail call
// and doesn't throw exceptions.
if (BB.size() > 0 &&
BB.getNumNonPseudos() <= kMaxInstructions &&
BB.lp_empty() &&
BC.MIA->isReturn(LastInstruction) &&
!BC.MIA->isTailCall(LastInstruction)) {
InliningCandidates.insert(&Function);
}
}
DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
<< " inlineable functions.\n");
}
void InlineSmallFunctions::findInliningCandidatesAggressive(
BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs) {
std::set<std::string> OverwrittenFunctions = {
"_ZN4HPHP13hash_string_iEPKcj",
"_ZN4HPHP21hash_string_cs_unsafeEPKcj",
"_ZN4HPHP14hash_string_csEPKcj",
"_ZN4HPHP20hash_string_i_unsafeEPKcj",
"_ZNK4HPHP10StringData10hashHelperEv"
};
for (const auto &BFIt : BFs) {
const auto &Function = BFIt.second;
if (!shouldOptimize(Function) ||
OverwrittenFunctions.count(Function.getSymbol()->getName()) ||
Function.hasEHRanges())
continue;
uint64_t FunctionSize = 0;
for (const auto *BB : Function.layout()) {
FunctionSize += BC.computeCodeSize(BB->begin(), BB->end());
}
assert(FunctionSize > 0 && "found empty function");
if (FunctionSize > kMaxSize)
continue;
bool FoundCFI = false;
for (const auto BB : Function.layout()) {
for (const auto &Inst : *BB) {
if (BC.MIA->isEHLabel(Inst) || BC.MIA->isCFI(Inst)) {
FoundCFI = true;
break;
}
}
}
if (!FoundCFI)
InliningCandidates.insert(&Function);
}
DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
<< " inlineable functions.\n");
}
namespace {
/// Returns whether a function creates a stack frame for itself or not.
/// If so, we need to manipulate the stack pointer when calling this function.
/// Since we're only inlining very small functions, we return false for now, but
/// we could for instance check if the function starts with 'push ebp'.
/// TODO generalize this.
bool createsStackFrame(const BinaryBasicBlock &) {
return false;
}
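// A hypothetical refinement of the check above (sketch only; isPush() is an
// assumed MCInstrAnalysis helper and may not exist under that name):
//
//   bool createsStackFrame(const BinaryContext &BC,
//                          const BinaryBasicBlock &BB) {
//     // Treat a push at the entry (e.g. "push %rbp") as evidence that the
//     // callee sets up its own frame.
//     return BB.size() > 0 && BC.MIA->isPush(*BB.begin());
//   }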
} // namespace
void InlineSmallFunctions::inlineCall(
BinaryContext &BC,
BinaryBasicBlock &BB,
MCInst *CallInst,
const BinaryBasicBlock &InlinedFunctionBB) {
assert(BC.MIA->isCall(*CallInst) && "Can only inline a call.");
assert(BC.MIA->isReturn(*InlinedFunctionBB.rbegin()) &&
"Inlined function should end with a return.");
std::vector<MCInst> InlinedInstance;
bool ShouldAdjustStack = createsStackFrame(InlinedFunctionBB);
// Move stack like 'call' would if needed.
if (ShouldAdjustStack) {
MCInst StackInc;
BC.MIA->createStackPointerIncrement(StackInc);
InlinedInstance.push_back(StackInc);
}
for (auto Instruction : InlinedFunctionBB) {
if (BC.MIA->isReturn(Instruction)) {
break;
}
if (!BC.MIA->isEHLabel(Instruction) &&
!BC.MIA->isCFI(Instruction)) {
InlinedInstance.push_back(Instruction);
}
}
// Move stack pointer like 'ret' would.
if (ShouldAdjustStack) {
MCInst StackDec;
BC.MIA->createStackPointerDecrement(StackDec);
InlinedInstance.push_back(StackDec);
}
BB.replaceInstruction(CallInst, InlinedInstance);
}
std::pair<BinaryBasicBlock *, unsigned>
InlineSmallFunctions::inlineCall(
BinaryContext &BC,
BinaryFunction &CallerFunction,
BinaryBasicBlock *CallerBB,
const unsigned CallInstIndex,
const BinaryFunction &InlinedFunction) {
// Get the instruction to be replaced with inlined code.
MCInst &CallInst = CallerBB->getInstructionAtIndex(CallInstIndex);
assert(BC.MIA->isCall(CallInst) && "Can only inline a call.");
// Point in the function after the inlined code.
BinaryBasicBlock *AfterInlinedBB = nullptr;
unsigned AfterInlinedIstrIndex = 0;
// In case of a tail call we should not remove any ret instructions from the
// inlined instance.
bool IsTailCall = BC.MIA->isTailCall(CallInst);
// The first block of the function to be inlined can be merged with the caller
// basic block. This cannot happen if there are jumps to the first block.
bool CanMergeFirstInlinedBlock = (*InlinedFunction.begin()).pred_size() == 0;
// If the call to be inlined is not at the end of its basic block and we have
// to inline more than one basic block (or even just one basic block that
// cannot be merged into the caller block), then the caller's basic block
// should be split.
bool ShouldSplitCallerBB =
CallInstIndex < CallerBB->size() - 1 &&
(InlinedFunction.size() > 1 || !CanMergeFirstInlinedBlock);
// Copy inlined function's basic blocks into a vector of basic blocks that
// will be inserted in the caller function (the inlined instance). Also, we
// keep a mapping from basic block index to the corresponding block in the
// inlined instance.
std::vector<std::unique_ptr<BinaryBasicBlock>> InlinedInstance;
std::unordered_map<const BinaryBasicBlock *, BinaryBasicBlock *> InlinedBBMap;
for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
InlinedBBMap[InlinedFunctionBB] = InlinedInstance.back().get();
if (InlinedFunction.hasValidProfile()) {
const auto Count = InlinedFunctionBB->getExecutionCount();
InlinedInstance.back()->setExecutionCount(Count);
}
}
if (ShouldSplitCallerBB) {
// Add one extra block at the inlined instance for the removed part of the
// caller block.
InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
if (CallerFunction.hasValidProfile()) {
const auto Count = CallerBB->getExecutionCount();
InlinedInstance.back()->setExecutionCount(Count);
}
}
// Copy instructions to the basic blocks of the inlined instance.
bool First = true;
for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
// Get the corresponding block of the inlined instance.
auto *InlinedInstanceBB = InlinedBBMap.at(InlinedFunctionBB);
bool IsExitingBlock = false;
// Copy instructions into the inlined instance.
for (auto Instruction : *InlinedFunctionBB) {
if (!IsTailCall &&
BC.MIA->isReturn(Instruction) &&
!BC.MIA->isTailCall(Instruction)) {
// Skip returns when the caller does a normal call as opposed to a tail
// call.
IsExitingBlock = true;
continue;
}
if (!IsTailCall &&
BC.MIA->isTailCall(Instruction)) {
// Convert tail calls to normal calls when the caller does a normal
// call.
if (!BC.MIA->convertTailCallToCall(Instruction))
assert(false && "unexpected tail call opcode found");
IsExitingBlock = true;
}
if (BC.MIA->isBranch(Instruction) &&
!BC.MIA->isIndirectBranch(Instruction)) {
// Convert the branch targets in the branch instructions that will be
// added to the inlined instance.
const MCSymbol *OldTargetLabel = nullptr;
const MCSymbol *OldFTLabel = nullptr;
MCInst *CondBranch = nullptr;
MCInst *UncondBranch = nullptr;
const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel,
OldFTLabel, CondBranch,
UncondBranch);
assert(Result &&
"analyzeBranch failed on instruction guaranteed to be a branch");
assert(OldTargetLabel);
const MCSymbol *NewTargetLabel = nullptr;
for (const auto SuccBB : InlinedFunctionBB->successors()) {
if (SuccBB->getLabel() == OldTargetLabel) {
NewTargetLabel = InlinedBBMap.at(SuccBB)->getLabel();
break;
}
}
assert(NewTargetLabel);
BC.MIA->replaceBranchTarget(Instruction, NewTargetLabel, BC.Ctx.get());
}
// TODO: Currently we simply ignore CFI instructions, but we need to
// address them for correctness.
if (!BC.MIA->isEHLabel(Instruction) &&
!BC.MIA->isCFI(Instruction)) {
InlinedInstanceBB->addInstruction(std::move(Instruction));
}
}
// Add CFG edges to the basic blocks of the inlined instance.
std::vector<BinaryBasicBlock *>
Successors(InlinedFunctionBB->succ_size(), nullptr);
std::transform(
InlinedFunctionBB->succ_begin(),
InlinedFunctionBB->succ_end(),
Successors.begin(),
[&InlinedBBMap](const BinaryBasicBlock *BB) {
return InlinedBBMap.at(BB);
});
if (InlinedFunction.hasValidProfile()) {
InlinedInstanceBB->addSuccessors(
Successors.begin(),
Successors.end(),
InlinedFunctionBB->branch_info_begin(),
InlinedFunctionBB->branch_info_end());
} else {
InlinedInstanceBB->addSuccessors(
Successors.begin(),
Successors.end());
}
if (IsExitingBlock) {
assert(Successors.size() == 0);
if (ShouldSplitCallerBB) {
if (InlinedFunction.hasValidProfile()) {
InlinedInstanceBB->addSuccessor(
InlinedInstance.back().get(),
InlinedInstanceBB->getExecutionCount());
} else {
InlinedInstanceBB->addSuccessor(InlinedInstance.back().get());
}
InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get());
} else if (!First || !CanMergeFirstInlinedBlock) {
assert(CallInstIndex == CallerBB->size() - 1);
assert(CallerBB->succ_size() <= 1);
if (CallerBB->succ_size() == 1) {
if (InlinedFunction.hasValidProfile()) {
InlinedInstanceBB->addSuccessor(
*CallerBB->succ_begin(),
InlinedInstanceBB->getExecutionCount());
} else {
InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin());
}
InlinedInstanceBB->addBranchInstruction(*CallerBB->succ_begin());
}
}
}
First = false;
}
if (ShouldSplitCallerBB) {
// Split the basic block that contains the call and add the removed
// instructions in the last block of the inlined instance.
// (Is it OK to have a basic block with just CFI instructions?)
std::vector<MCInst> TrailInstructions =
CallerBB->splitInstructions(&CallInst);
assert(TrailInstructions.size() > 0);
InlinedInstance.back()->addInstructions(
TrailInstructions.begin(),
TrailInstructions.end());
// Add CFG edges for the block with the removed instructions.
if (CallerFunction.hasValidProfile()) {
InlinedInstance.back()->addSuccessors(
CallerBB->succ_begin(),
CallerBB->succ_end(),
CallerBB->branch_info_begin(),
CallerBB->branch_info_end());
} else {
InlinedInstance.back()->addSuccessors(
CallerBB->succ_begin(),
CallerBB->succ_end());
}
// Update the after-inlined point.
AfterInlinedBB = InlinedInstance.back().get();
AfterInlinedIstrIndex = 0;
}
assert(InlinedInstance.size() > 0 && "found function with no basic blocks");
assert(InlinedInstance.front()->size() > 0 &&
"found function with empty basic block");
// If the inlining cannot happen as a simple instruction insertion into
// CallerBB, we remove the outgoing CFG edges of the caller block.
if (InlinedInstance.size() > 1 || !CanMergeFirstInlinedBlock) {
CallerBB->removeSuccessors(CallerBB->succ_begin(), CallerBB->succ_end());
if (!ShouldSplitCallerBB) {
// Update the after-inlined point.
AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
AfterInlinedIstrIndex = 0;
}
} else {
assert(!ShouldSplitCallerBB);
// Update the after-inlined point.
if (CallInstIndex < CallerBB->size() - 1) {
AfterInlinedBB = CallerBB;
AfterInlinedIstrIndex =
CallInstIndex + InlinedInstance.front()->size();
} else {
AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
AfterInlinedIstrIndex = 0;
}
}
// Do the inlining by merging the first block of the inlined instance into
// the caller basic block if possible and adding the rest of the inlined
// instance basic blocks in the caller function.
if (CanMergeFirstInlinedBlock) {
CallerBB->replaceInstruction(
&CallInst,
InlinedInstance.front()->begin(),
InlinedInstance.front()->end());
if (InlinedInstance.size() > 1) {
auto FirstBB = InlinedInstance.begin()->get();
if (InlinedFunction.hasValidProfile()) {
CallerBB->addSuccessors(
FirstBB->succ_begin(),
FirstBB->succ_end(),
FirstBB->branch_info_begin(),
FirstBB->branch_info_end());
} else {
CallerBB->addSuccessors(
FirstBB->succ_begin(),
FirstBB->succ_end());
}
FirstBB->removeSuccessors(FirstBB->succ_begin(), FirstBB->succ_end());
}
InlinedInstance.erase(InlinedInstance.begin());
} else {
CallerBB->eraseInstruction(&CallInst);
// The successor edge gets the caller block's execution count whether or not
// the caller has a valid profile, so there is no need to branch on profile
// validity here.
CallerBB->addSuccessor(InlinedInstance.front().get(),
                       CallerBB->getExecutionCount());
}
CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance));
return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex);
}
bool InlineSmallFunctions::inlineCallsInFunction(
BinaryContext &BC,
BinaryFunction &Function) {
std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
Function.layout().end());
std::sort(Blocks.begin(), Blocks.end(),
[](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
return BB1->getExecutionCount() > BB2->getExecutionCount();
});
uint32_t ExtraSize = 0;
for (auto BB : Blocks) {
for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst)) {
TotalDynamicCalls += BB->getExecutionCount();
}
}
}
bool DidInlining = false;
for (auto BB : Blocks) {
if (BB->isCold())
continue;
for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst) &&
!BC.MIA->isTailCall(Inst) &&
Inst.size() == 1 &&
Inst.getOperand(0).isExpr()) {
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
assert(TargetSymbol && "target symbol expected for direct call");
const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
if (TargetFunction) {
bool CallToInlineableFunction =
InliningCandidates.count(TargetFunction);
TotalInlineableCalls +=
CallToInlineableFunction * BB->getExecutionCount();
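// Size budget: only inline while the caller's estimated hot size plus all
// code inlined so far (ExtraSize) plus this callee still fits within the
// caller's original maximum size.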
if (CallToInlineableFunction &&
TargetFunction->getSize() + ExtraSize
+ Function.estimateHotSize() < Function.getMaxSize()) {
auto NextInstIt = std::next(InstIt);
inlineCall(BC, *BB, &Inst, *TargetFunction->begin());
DidInlining = true;
DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
<< *TargetFunction << " in "
<< Function << "\n");
InstIt = NextInstIt;
ExtraSize += TargetFunction->getSize();
InlinedDynamicCalls += BB->getExecutionCount();
continue;
}
}
}
++InstIt;
}
}
return DidInlining;
}
bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
BinaryContext &BC,
BinaryFunction &Function) {
std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
Function.layout().end());
std::sort(Blocks.begin(), Blocks.end(),
[](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
return BB1->getExecutionCount() > BB2->getExecutionCount();
});
uint32_t ExtraSize = 0;
for (auto BB : Blocks) {
for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst)) {
TotalDynamicCalls += BB->getExecutionCount();
}
}
}
bool DidInlining = false;
for (auto BB : Blocks) {
if (BB->isCold())
continue;
unsigned InstIndex = 0;
for (auto InstIt = BB->begin(); InstIt != BB->end(); ) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst) &&
Inst.size() == 1 &&
Inst.getOperand(0).isExpr()) {
assert(!BC.MIA->isInvoke(Inst));
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
assert(TargetSymbol && "target symbol expected for direct call");
const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
if (TargetFunction) {
bool CallToInlineableFunction =
InliningCandidates.count(TargetFunction);
TotalInlineableCalls +=
CallToInlineableFunction * BB->getExecutionCount();
if (CallToInlineableFunction &&
TargetFunction->getSize() + ExtraSize
+ Function.estimateHotSize() < Function.getMaxSize()) {
unsigned NextInstIndex = 0;
BinaryBasicBlock *NextBB = nullptr;
std::tie(NextBB, NextInstIndex) =
inlineCall(BC, Function, BB, InstIndex, *TargetFunction);
DidInlining = true;
DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
<< *TargetFunction << " in "
<< Function << "\n");
InstIndex = NextBB == BB ? NextInstIndex : BB->size();
InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end();
ExtraSize += TargetFunction->getSize();
InlinedDynamicCalls += BB->getExecutionCount();
continue;
}
}
}
++InstIndex;
++InstIt;
}
}
return DidInlining;
}
bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) {
for (auto &Name : opts::ForceInlineFunctions) {
if (BF.hasName(Name))
return true;
}
return false;
}
void InlineSmallFunctions::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
if (opts::AggressiveInlining)
findInliningCandidatesAggressive(BC, BFs);
else
findInliningCandidates(BC, BFs);
std::vector<BinaryFunction *> ConsideredFunctions;
for (auto &It : BFs) {
auto &Function = It.second;
if (!shouldOptimize(Function) ||
(Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE &&
!mustConsider(Function)))
continue;
ConsideredFunctions.push_back(&Function);
}
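// Visit the hottest functions first; the loop below stops modifying
// functions once roughly kMaxFunctions of them have been changed.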
std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(),
[](BinaryFunction *A, BinaryFunction *B) {
return B->getExecutionCount() < A->getExecutionCount();
});
unsigned ModifiedFunctions = 0;
for (unsigned i = 0; i < ConsideredFunctions.size() &&
ModifiedFunctions <= kMaxFunctions; ++i) {
auto &Function = *ConsideredFunctions[i];
const bool DidInline = opts::AggressiveInlining
? inlineCallsInFunctionAggressive(BC, Function)
: inlineCallsInFunction(BC, Function);
if (DidInline) {
Modified.insert(&Function);
++ModifiedFunctions;
}
}
DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of "
<< TotalDynamicCalls << " function calls in the profile.\n"
<< "BOLT-INFO: Inlined calls represent "
<< format("%.1f",
100.0 * InlinedDynamicCalls / TotalInlineableCalls)
<< "% of all inlineable calls in the profile.\n");
}
} // namespace bolt
} // namespace llvm

102 bolt/Passes/Inliner.h Normal file

@@ -0,0 +1,102 @@
//===--- Passes/Inliner.h - Inlining infra for BOLT -----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryPasses.h"
namespace llvm {
namespace bolt {
/// Inlining of single basic block functions.
/// The pass currently does not handle CFI instructions. This is needed for
/// correctness and we may break exception handling because of this.
class InlineSmallFunctions : public BinaryFunctionPass {
private:
std::set<const BinaryFunction *> InliningCandidates;
/// Maximum number of instructions in an inlined function.
static const unsigned kMaxInstructions = 8;
/// Maximum code size (in bytes) of inlined function (used by aggressive
/// inlining).
static const uint64_t kMaxSize = 60;
/// Maximum number of functions that will be considered for inlining (in
/// descending hotness order).
static const unsigned kMaxFunctions = 30000;
/// Statistics collected for debugging.
uint64_t TotalDynamicCalls = 0;
uint64_t InlinedDynamicCalls = 0;
uint64_t TotalInlineableCalls = 0;
std::unordered_set<const BinaryFunction *> Modified;
static bool mustConsider(const BinaryFunction &BF);
void findInliningCandidates(BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs);
/// Inline the call in CallInst to InlinedFunctionBB (the only BB of the
/// called function).
void inlineCall(BinaryContext &BC,
BinaryBasicBlock &BB,
MCInst *CallInst,
const BinaryBasicBlock &InlinedFunctionBB);
bool inlineCallsInFunction(BinaryContext &BC,
BinaryFunction &Function);
/// The following methods do a more aggressive inlining pass, where we
/// inline calls as well as tail calls and we are not limited to inlining
/// functions with only one basic block.
/// FIXME: Currently these are broken since they do not work with the split
/// function option.
void findInliningCandidatesAggressive(
BinaryContext &BC, const std::map<uint64_t, BinaryFunction> &BFs);
bool inlineCallsInFunctionAggressive(
BinaryContext &BC, BinaryFunction &Function);
/// Inline the call in CallInst to InlinedFunction. The inlined function should
/// not contain any landing pad or thrower edges but may have more than one
/// basic block.
///
/// Return the location (basic block and instruction index) where the code of
/// the caller function continues after the inlined code.
std::pair<BinaryBasicBlock *, unsigned>
inlineCall(BinaryContext &BC,
BinaryFunction &CallerFunction,
BinaryBasicBlock *CallerBB,
const unsigned CallInstIndex,
const BinaryFunction &InlinedFunction);
public:
explicit InlineSmallFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
const char *getName() const override {
return "inlining";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm
#endif

698 bolt/Passes/ReorderAlgorithm.cpp Normal file

@@ -0,0 +1,698 @@
//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implements different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//
#include "ReorderAlgorithm.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <queue>
#include <functional>
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
using namespace llvm;
using namespace bolt;
namespace opts {
static cl::opt<bool>
PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore);
static cl::opt<uint32_t>
RandomSeed("bolt-seed",
cl::desc("seed for randomization"),
cl::init(42),
cl::ZeroOrMore);
} // namespace opts
namespace {
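// Boost-style hash_combine: folds Val's hash into Seed using the 32-bit
// golden-ratio constant 0x9e3779b9 and shifts to spread the bits.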
template <class T>
inline void hashCombine(size_t &Seed, const T &Val) {
std::hash<T> Hasher;
Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}
template <typename A, typename B>
struct HashPair {
size_t operator()(const std::pair<A,B>& Val) const {
std::hash<A> Hasher;
size_t Seed = Hasher(Val.first);
hashCombine(Seed, Val.second);
return Seed;
}
};
}
void ClusterAlgorithm::computeClusterAverageFrequency() {
AvgFreq.resize(Clusters.size(), 0.0);
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
double Freq = 0.0;
for (auto BB : Clusters[I]) {
if (BB->getNumNonPseudos() > 0)
Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos();
}
AvgFreq[I] = Freq;
}
}
void ClusterAlgorithm::printClusters() const {
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
errs() << "Cluster number " << I;
if (AvgFreq.size() == Clusters.size())
errs() << " (frequency: " << AvgFreq[I] << ")";
errs() << " : ";
auto Sep = "";
for (auto BB : Clusters[I]) {
errs() << Sep << BB->getName();
Sep = ", ";
}
errs() << "\n";
}
}
void ClusterAlgorithm::reset() {
Clusters.clear();
ClusterEdges.clear();
AvgFreq.clear();
}
void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
}
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
return Hasher(std::make_pair(E.Src, E.Dst));
}
bool GreedyClusterAlgorithm::EdgeEqual::operator()(
const EdgeTy &A, const EdgeTy &B) const {
return A.Src == B.Src && A.Dst == B.Dst;
}
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
bool ComputeEdges) {
reset();
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
// maximize weight during a path traversing all BBs. In this way, we will
// convert the hottest branches into fall-throughs.
// This is the queue of edges from which we will pop edges and use them to
// cluster basic blocks in a greedy fashion.
std::vector<EdgeTy> Queue;
// Initialize inter-cluster weights.
if (ComputeEdges)
ClusterEdges.resize(BF.layout_size());
// Initialize clusters and edge queue.
for (auto BB : BF.layout()) {
// Create a cluster for this BB.
uint32_t I = Clusters.size();
Clusters.emplace_back();
auto &Cluster = Clusters.back();
Cluster.push_back(BB);
BBToClusterMap[BB] = I;
// Populate priority queue with edges.
auto BI = BB->branch_info_begin();
for (auto &I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
Queue.emplace_back(EdgeTy(BB, I, BI->Count));
++BI;
}
}
// Sort and adjust the edge queue.
initQueue(Queue, BF);
// Grow clusters in a greedy fashion.
while (!Queue.empty()) {
auto E = Queue.back();
Queue.pop_back();
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
DEBUG(dbgs() << "Popped edge ";
E.print(dbgs());
dbgs() << "\n");
// Case 1: SrcBB and DstBB are the same, or DstBB is the entry block. Ignore
// this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
// Case 2: If they are already allocated at the same cluster, just increase
// the weight of this cluster
if (I == J) {
if (ComputeEdges)
ClusterEdges[I][I] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
continue;
}
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
if (areClustersCompatible(ClusterA, ClusterB, E)) {
// Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
// allowing us to merge two clusters.
for (auto BB : ClusterB)
BBToClusterMap[BB] = I;
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
ClusterB.clear();
if (ComputeEdges) {
// Increase the intra-cluster edge count of cluster A with the count of
// this edge as well as with the total count of previously visited edges
// from cluster B to cluster A.
ClusterEdges[I][I] += E.Count;
ClusterEdges[I][I] += ClusterEdges[J][I];
// Iterate through all inter-cluster edges and transfer edges targeting
// cluster B to cluster A.
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
ClusterEdges[K][I] += ClusterEdges[K][J];
}
// Adjust the weights of the remaining edges and re-sort the queue.
adjustQueue(Queue, BF);
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
} else {
// Case 4: Both SrcBB and DstBB are allocated in positions such that we
// cannot merge them. Add the count of this edge to the inter-cluster edge count
// between clusters A and B to help us decide ordering between these
// clusters.
if (ComputeEdges)
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
}
}
}
void GreedyClusterAlgorithm::reset() {
ClusterAlgorithm::reset();
BBToClusterMap.clear();
}
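// "PH" presumably stands for Pettis-Hansen: edges are processed from hottest
// to coldest and clusters are merged only tail-to-head, so the hottest
// branches become fall-throughs.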
void PHGreedyClusterAlgorithm::initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Define a comparison function to establish SWO between edges.
auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) {
// With equal weights, prioritize branches with lower index
// source/destination. This helps to keep original block order for blocks
// when optimal order cannot be deduced from a profile.
if (A.Count == B.Count) {
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
return (SrcOrder != 0)
? SrcOrder > 0
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
}
return A.Count < B.Count;
};
// Sort edges in increasing profile count order.
std::sort(Queue.begin(), Queue.end(), Comp);
}
void PHGreedyClusterAlgorithm::adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Nothing to do.
return;
}
bool PHGreedyClusterAlgorithm::areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
return Front.back() == E.Src && Back.front() == E.Dst;
}
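// The weight of an edge approximates the net number of taken branches saved
// by making it a fall-through: its own count minus the counts of competing
// edges that leave the same source or enter the same destination (those
// would have to be taken instead).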
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
const EdgeTy &E, const BinaryFunction &BF) const {
const BinaryBasicBlock *SrcBB = E.Src;
const BinaryBasicBlock *DstBB = E.Dst;
// Initial weight value.
int64_t W = (int64_t)E.Count;
// Adjust the weight by taking into account other edges with the same source.
auto BI = SrcBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
// Ignore edges with same source and destination, edges that target the
// entry block as well as the edge E itself.
if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
W -= (int64_t)BI->Count;
++BI;
}
// Adjust the weight by taking into account other edges with the same
// destination.
for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
// Ignore edges with same source and destination as well as the edge E
// itself.
if (PredBB == DstBB || PredBB == SrcBB)
continue;
auto BI = PredBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
if (SuccBB == DstBB)
break;
++BI;
}
assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
W -= (int64_t)BI->Count;
}
return W;
}
void MinBranchGreedyClusterAlgorithm::initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Initialize edge weights.
for (const EdgeTy &E : Queue)
Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
// Sort edges in increasing weight order.
adjustQueue(Queue, BF);
}
void MinBranchGreedyClusterAlgorithm::adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Define a comparison function to establish SWO between edges.
auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
// With equal weights, prioritize branches with lower index
// source/destination. This helps to keep original block order for blocks
// when optimal order cannot be deduced from a profile.
if (Weight[A] == Weight[B]) {
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
return (SrcOrder != 0)
? SrcOrder > 0
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
}
return Weight[A] < Weight[B];
};
// Iterate through all remaining edges to find edges that have their
// source and destination in the same cluster.
std::vector<EdgeTy> NewQueue;
for (const EdgeTy &E : Queue) {
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
// Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
// this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
// Case 2: They are already allocated at the same cluster or incompatible
// clusters. Adjust the weights of edges with the same source or
// destination, so that this edge has no effect on them any more, and ignore
// this edge. Also increase the intra- (or inter-) cluster edge count.
if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
if (!ClusterEdges.empty())
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (src, dst belong to same cluster or incompatible "
"clusters)\n");
for (const auto *SuccBB : SrcBB->successors()) {
if (SuccBB == DstBB)
continue;
auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
for (const auto *PredBB : DstBB->predecessors()) {
if (PredBB == SrcBB)
continue;
auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
continue;
}
// Case 3: None of the previous cases is true, so just keep this edge in
// the queue.
NewQueue.emplace_back(E);
}
// Sort remaining edges in increasing weight order.
Queue.swap(NewQueue);
std::sort(Queue.begin(), Queue.end(), Comp);
}
bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
return Front.back() == E.Src && Back.front() == E.Dst;
}
void MinBranchGreedyClusterAlgorithm::reset() {
GreedyClusterAlgorithm::reset();
Weight.clear();
}
void OptimalReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
std::vector<std::vector<uint64_t>> Weight;
std::unordered_map<const BinaryBasicBlock *, int> BBToIndex;
std::vector<BinaryBasicBlock *> IndexToBB;
unsigned N = BF.layout_size();
// Populating weight map and index map
for (auto BB : BF.layout()) {
BBToIndex[BB] = IndexToBB.size();
IndexToBB.push_back(BB);
}
Weight.resize(N);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
Weight[BBToIndex[BB]].resize(N);
for (auto I : BB->successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
++BI;
}
}
std::vector<std::vector<int64_t>> DP;
DP.resize(1 << N);
for (auto &Elmt : DP) {
Elmt.resize(N, -1);
}
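// Note: the DP table has (1 << N) * N entries, so this exhaustive TSP search
// is only practical for functions with a small number of basic blocks.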
// Start with the entry basic block being allocated with cost zero
DP[1][0] = 0;
// Walk through TSP solutions using a bitmask to represent state (current set
// of BBs in the layout)
unsigned BestSet = 1;
unsigned BestLast = 0;
int64_t BestWeight = 0;
for (unsigned Set = 1; Set < (1U << N); ++Set) {
// Traverse each possibility of Last BB visited in this layout
for (unsigned Last = 0; Last < N; ++Last) {
// Case 1: There is no possible layout with this BB as Last
if (DP[Set][Last] == -1)
continue;
// Case 2: There is a layout with this Set and this Last, and we try
// to expand this set with New
for (unsigned New = 1; New < N; ++New) {
// Case 2a: BB "New" is already in this Set
if ((Set & (1 << New)) != 0)
continue;
// Case 2b: BB "New" is not in this set and we add it to this Set and
// record total weight of this layout with "New" as the last BB.
unsigned NewSet = (Set | (1 << New));
if (DP[NewSet][New] == -1)
DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
DP[NewSet][New] = std::max(DP[NewSet][New],
DP[Set][Last] + (int64_t)Weight[Last][New]);
if (DP[NewSet][New] > BestWeight) {
BestWeight = DP[NewSet][New];
BestSet = NewSet;
BestLast = New;
}
}
}
}
// Define final function layout based on layout that maximizes weight
unsigned Last = BestLast;
unsigned Set = BestSet;
std::vector<bool> Visited;
Visited.resize(N);
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
while (Set != 0) {
int64_t Best = -1;
for (unsigned I = 0; I < N; ++I) {
if (DP[Set][I] == -1)
continue;
if (DP[Set][I] > Best) {
Last = I;
Best = DP[Set][I];
}
}
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
}
std::reverse(Order.begin(), Order.end());
// Finalize layout with BBs that weren't assigned to the layout
for (auto BB : BF.layout()) {
if (Visited[BBToIndex[BB]] == false)
Order.push_back(BB);
}
}
void OptimizeReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
if (opts::PrintClusters)
CAlgo->printClusters();
// Arrange basic blocks according to clusters.
for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
auto &ClusterEdges = CAlgo->ClusterEdges;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Do a topological sort for clusters, prioritizing frequently-executed BBs
// during the traversal.
std::stack<uint32_t> Stack;
std::vector<uint32_t> Status;
std::vector<uint32_t> Parent;
Status.resize(Clusters.size(), 0);
Parent.resize(Clusters.size(), 0);
constexpr uint32_t STACKED = 1;
constexpr uint32_t VISITED = 2;
Status[0] = STACKED;
Stack.push(0);
while (!Stack.empty()) {
uint32_t I = Stack.top();
if (!(Status[I] & VISITED)) {
Status[I] |= VISITED;
// Order successors by weight
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
return ClusterEdges[I][A] > ClusterEdges[I][B];
};
std::priority_queue<uint32_t, std::vector<uint32_t>,
decltype(ClusterComp)> SuccQueue(ClusterComp);
for (auto &Target: ClusterEdges[I]) {
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
!Clusters[Target.first].empty()) {
Parent[Target.first] = I;
Status[Target.first] = STACKED;
SuccQueue.push(Target.first);
}
}
while (!SuccQueue.empty()) {
Stack.push(SuccQueue.top());
SuccQueue.pop();
}
continue;
}
// Already visited this node
Stack.pop();
ClusterOrder.push_back(I);
}
std::reverse(ClusterOrder.begin(), ClusterOrder.end());
// Put unreachable clusters at the end
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!(Status[I] & VISITED) && !Clusters[I].empty())
ClusterOrder.push_back(I);
// Sort nodes with equal precedence
auto Beg = ClusterOrder.begin();
// Don't reorder the first cluster, which contains the function entry point
++Beg;
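// The comparator walks the Parent chains to avoid placing a cluster before
// one of its ancestors in the traversal tree; unrelated clusters are ordered
// by decreasing average frequency.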
std::stable_sort(Beg, ClusterOrder.end(),
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
uint32_t P = Parent[A];
while (Parent[P] != 0) {
if (Parent[P] == B)
return false;
P = Parent[P];
}
P = Parent[B];
while (Parent[P] != 0) {
if (Parent[P] == A)
return true;
P = Parent[P];
}
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
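// Illustrative sketch (editorial, not part of the original sources): a
// compact recursive variant of the weighted topological traversal above.
// Successors are visited heaviest-edge-first, so the reverse postorder keeps
// hot targets close behind their sources while every inter-cluster edge still
// points forward. The container aliases are hypothetical simplifications of
// ClusterEdges/ClusterOrder; the includes may duplicate existing ones.
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

using ClusterEdgeMap = std::vector<std::unordered_map<uint32_t, uint64_t>>;

static void visitCluster(uint32_t I, const ClusterEdgeMap &Edges,
                         std::vector<bool> &Seen,
                         std::vector<uint32_t> &PostOrder) {
  Seen[I] = true;
  // Sort this cluster's successors so the hottest outgoing edge is taken first.
  std::vector<std::pair<uint32_t, uint64_t>> Succs(Edges[I].begin(),
                                                   Edges[I].end());
  std::sort(Succs.begin(), Succs.end(),
            [](const std::pair<uint32_t, uint64_t> &A,
               const std::pair<uint32_t, uint64_t> &B) {
              return A.second > B.second;
            });
  for (const auto &S : Succs)
    if (S.second > 0 && !Seen[S.first])
      visitCluster(S.first, Edges, Seen, PostOrder);
  PostOrder.push_back(I);
}

static std::vector<uint32_t> orderClusters(const ClusterEdgeMap &Edges) {
  if (Edges.empty())
    return {};
  std::vector<bool> Seen(Edges.size(), false);
  std::vector<uint32_t> Order;
  visitCluster(0, Edges, Seen, Order); // Cluster 0 holds the entry block.
  std::reverse(Order.begin(), Order.end());
  // Append clusters unreachable from the entry so none are dropped.
  for (uint32_t I = 0, E = static_cast<uint32_t>(Edges.size()); I < E; ++I)
    if (!Seen[I])
      Order.push_back(I);
  return Order;
}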
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Order clusters based on average instruction execution frequency
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
ClusterOrder.push_back(I);
// Don't reorder the first cluster, which contains the function entry point
std::stable_sort(std::next(ClusterOrder.begin()),
ClusterOrder.end(),
[&AvgFreq](uint32_t A, uint32_t B) {
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
void ReverseReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
auto FirstBB = *BF.layout_begin();
Order.push_back(FirstBB);
for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
Order.push_back(*RLI);
}
void RandomClusterReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Collect non-empty clusters for shuffling
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
ClusterOrder.push_back(I);
std::srand(opts::RandomSeed);
std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end());
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
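// Editorial note (not part of the original sources): std::random_shuffle, as
// used above, was deprecated in C++14 and removed in C++17. If this code is
// ever built against a newer standard, a seeded <random>-based equivalent
// could look like the sketch below; reusing opts::RandomSeed as the seed is
// an assumption, and the helper name is hypothetical.
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

static void shuffleNonEntryClusters(std::vector<uint32_t> &ClusterOrder,
                                    unsigned Seed) {
  if (ClusterOrder.size() <= 1)
    return;
  std::mt19937 Gen(Seed);
  // Keep the entry cluster in place and shuffle the rest.
  std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), Gen);
}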

View File

@ -0,0 +1,268 @@
//===--- Passes/ReorderAlgorithm.h - Basic block reordering algorithms ----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Interface to different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H
#include "BinaryFunction.h"
#include "llvm/Support/ErrorHandling.h"
#include <unordered_map>
#include <memory>
#include <vector>
namespace llvm {
class raw_ostream;
namespace bolt {
class BinaryBasicBlock;
class BinaryFunction;
/// Objects of this class implement various basic block clustering algorithms.
/// Basic block clusters are chains of basic blocks that should be laid out
/// in this order to maximize performance. These algorithms group basic blocks
/// into clusters using execution profile data and various heuristics.
class ClusterAlgorithm {
public:
using ClusterTy = std::vector<BinaryBasicBlock *>;
std::vector<ClusterTy> Clusters;
std::vector<std::unordered_map<uint32_t, uint64_t>> ClusterEdges;
std::vector<double> AvgFreq;
/// Group the basic blocks in the given function into clusters stored in the
/// Clusters vector. Also encode relative weights between two clusters in
/// the ClusterEdges vector if requested. This vector is indexed by
/// the cluster indices in the Clusters vector.
virtual void clusterBasicBlocks(const BinaryFunction &BF,
bool ComputeEdges = false) = 0;
/// Compute for each cluster its average execution frequency, that is,
/// the sum of average frequencies of its blocks (execution count / # instrs);
/// a small numeric sketch follows this class.
/// The average frequencies are stored in the AvgFreq vector, indexed by the
/// cluster indices in the Clusters vector.
void computeClusterAverageFrequency();
/// Clear clusters and related info.
virtual void reset();
void printClusters() const;
virtual ~ClusterAlgorithm() {}
};
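/// Illustrative sketch (editorial, not part of the original header): what
/// "average execution frequency of a cluster" means numerically. Plain
/// per-block execution counts and instruction counts stand in for the data
/// normally read from BinaryBasicBlock; both parameters are hypothetical and
/// the helper relies on the <vector> include above.
inline double exampleClusterAverageFrequency(
    const std::vector<uint64_t> &ExecCounts,
    const std::vector<uint64_t> &NumInstrs) {
  double Freq = 0.0;
  for (std::size_t I = 0; I < ExecCounts.size() && I < NumInstrs.size(); ++I)
    if (NumInstrs[I] != 0)
      Freq += static_cast<double>(ExecCounts[I]) / NumInstrs[I];
  return Freq; // Sum of per-block (execution count / # instrs) values.
}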
/// Base class for a greedy clustering algorithm that selects edges in order
/// based on some heuristic and uses them to join basic blocks into clusters.
class GreedyClusterAlgorithm : public ClusterAlgorithm {
protected:
// Represents an edge between two basic blocks, with source, destination, and
// profile count.
struct EdgeTy {
const BinaryBasicBlock *Src;
const BinaryBasicBlock *Dst;
uint64_t Count;
EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst,
uint64_t Count) :
Src(Src), Dst(Dst), Count(Count) {}
void print(raw_ostream &OS) const;
};
struct EdgeHash {
size_t operator() (const EdgeTy &E) const;
};
struct EdgeEqual {
bool operator() (const EdgeTy &A, const EdgeTy &B) const;
};
// Virtual methods that allow custom specialization of the heuristic used by
// the algorithm to select edges.
virtual void initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
virtual void adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
virtual bool areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0;
// Map from basic block to owning cluster index.
using BBToClusterMapTy = std::unordered_map<const BinaryBasicBlock *,
unsigned>;
BBToClusterMapTy BBToClusterMap;
public:
void clusterBasicBlocks(const BinaryFunction &BF,
bool ComputeEdges = false) override;
void reset() override;
};
/// This clustering algorithm is based on a greedy heuristic suggested by
/// Pettis and Hansen (PLDI '90).
class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
protected:
void initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
void adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
bool areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
override;
};
/// This clustering algorithm is based on a greedy heuristic that is a
/// modification of the heuristic suggested by Pettis (PLDI '90). It is
/// geared towards minimizing branches.
class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
private:
// Map from an edge to its weight which is used by the algorithm to sort the
// edges.
std::unordered_map<EdgeTy, int64_t, EdgeHash, EdgeEqual> Weight;
// The weight of an edge is calculated as the win in branches if we choose
// to lay out this edge as a fall-through. For example, consider the edges
// A -> B with execution count 500,
// A -> C with execution count 100, and
// D -> B with execution count 150,
// where B, C are the only successors of A and A, D are the only predecessors
// of B. Then if we choose to lay out edge A -> B as a fall-through, the win
// in branches would be 500 - 100 - 150 = 250. That is the weight of edge
// A -> B. A minimal numeric sketch of this computation follows the class
// definition below.
int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const;
protected:
void initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
void adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
bool areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
override;
public:
void reset() override;
};
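/// Illustrative sketch (editorial, not part of the original header): the
/// branch "win" described in the comment above calculateWeight, computed from
/// plain counts. For the example given there, the result is
/// 500 - 100 - 150 == 250. The parameters are hypothetical stand-ins for the
/// CFG data the real calculateWeight gathers.
inline int64_t exampleFallThroughWin(uint64_t EdgeCount,
                                     uint64_t OtherSuccCountsOfSrc,
                                     uint64_t OtherPredCountsOfDst) {
  return static_cast<int64_t>(EdgeCount) -
         static_cast<int64_t>(OtherSuccCountsOfSrc) -
         static_cast<int64_t>(OtherPredCountsOfDst);
}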
/// Objects of this class implement various basic block reordering algorithms.
/// Most of these algorithms depend on a clustering algorithm (see the usage
/// sketch at the end of this header).
/// Here we have 3 conflicting goals as to how to lay out clusters. If we want
/// to minimize jump offsets, we should put clusters with heavy inter-cluster
/// dependence as close as possible. If we want to maximize the probability
/// that all inter-cluster edges are predicted as not-taken, we should enforce
/// a topological order to make targets appear after sources, creating forward
/// branches. If we want to separate hot from cold blocks to maximize the
/// probability that infrequently executed code doesn't pollute the cache, we
/// should put clusters in descending order of hotness.
class ReorderAlgorithm {
protected:
std::unique_ptr<ClusterAlgorithm> CAlgo;
public:
ReorderAlgorithm() { }
explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
CAlgo(std::move(CAlgo)) { }
using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
/// Reorder the basic blocks of the given function and store the new order in
/// the given Order vector.
virtual void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const = 0;
void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
this->CAlgo.reset(CAlgo);
}
virtual ~ReorderAlgorithm() { }
};
/// Dynamic programming implementation for the TSP, applied to BB layout. Find
/// the optimal way to maximize weight during a path traversing all BBs. In
/// this way, we will convert the hottest branches into fall-throughs.
///
/// Uses an amount of memory exponential in the number of basic blocks and
/// should only be used for small functions.
class OptimalReorderAlgorithm : public ReorderAlgorithm {
public:
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
/// Simple algorithm that groups basic blocks into clusters and then
/// lays them out cluster after cluster.
class OptimizeReorderAlgorithm : public ReorderAlgorithm {
public:
explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
ReorderAlgorithm(std::move(CAlgo)) { }
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
/// This reorder algorithm tries to ensure that all inter-cluster edges are
/// predicted as not-taken, by enforcing a topological order to make
/// targets appear after sources, creating forward branches.
class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
public:
explicit OptimizeBranchReorderAlgorithm(
std::unique_ptr<ClusterAlgorithm> CAlgo) :
ReorderAlgorithm(std::move(CAlgo)) { }
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
/// This reorder algorithm tries to separate hot from cold blocks to maximize
/// the probability that infrequently executed code doesn't pollute the cache,
/// by putting clusters in descending order of hotness.
class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
public:
explicit OptimizeCacheReorderAlgorithm(
std::unique_ptr<ClusterAlgorithm> CAlgo) :
ReorderAlgorithm(std::move(CAlgo)) { }
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
/// Toy example that simply reverses the original basic block order.
class ReverseReorderAlgorithm : public ReorderAlgorithm {
public:
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
/// Create clusters as usual and place them in random order.
class RandomClusterReorderAlgorithm : public ReorderAlgorithm {
public:
explicit RandomClusterReorderAlgorithm(
std::unique_ptr<ClusterAlgorithm> CAlgo) :
ReorderAlgorithm(std::move(CAlgo)) { }
void reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
} // namespace bolt
} // namespace llvm
#endif
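// Illustrative usage sketch (editorial, not part of the original header): how
// a reorder algorithm is composed with a clustering algorithm and applied to
// a function. `BF` is assumed to be a fully built BinaryFunction with profile
// data attached; everything else comes from the declarations above.
#include "Passes/ReorderAlgorithm.h"
#include <memory>

void reorderExample(const llvm::bolt::BinaryFunction &BF) {
  using namespace llvm::bolt;
  // Cluster with the Pettis-Hansen heuristic, then lay clusters out hottest
  // first.
  OptimizeCacheReorderAlgorithm Algo(
      std::unique_ptr<ClusterAlgorithm>(new PHGreedyClusterAlgorithm()));
  ReorderAlgorithm::BasicBlockOrder Order;
  Algo.reorderBasicBlocks(BF, Order);
  // Order now holds the new basic block layout for BF.
}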