[BOLT] Move BOLT passes under Passes subdirectory (NFC).
Summary: Move passes under Passes subdirectory. Move inlining passes under Passes/Inliner.* (cherry picked from FBD4575832)
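The visible effect on client code is an include-path change plus a new static library; a representative before/after, assembled from the hunks below:

    // before
    #include "BinaryPasses.h"
    #include "FrameOptimizerPass.h"

    // after
    #include "Passes/BinaryPasses.h"
    #include "Passes/FrameOptimizer.h"
    #include "Passes/Inliner.h"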
This commit is contained in:
parent f06a1455ea
commit 88244a10bb
@@ -12,8 +12,8 @@
 #include "BinaryBasicBlock.h"
 #include "BinaryFunction.h"
-#include "ReorderAlgorithm.h"
 #include "DataReader.h"
+#include "Passes/ReorderAlgorithm.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -10,7 +10,8 @@
 //===----------------------------------------------------------------------===//

 #include "BinaryPassManager.h"
-#include "FrameOptimizerPass.h"
+#include "Passes/FrameOptimizer.h"
+#include "Passes/Inliner.h"
 #include "llvm/Support/Timer.h"

 using namespace llvm;
@@ -15,7 +15,7 @@
 #define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H

 #include "BinaryFunction.h"
-#include "BinaryPasses.h"
+#include "Passes/BinaryPasses.h"
 #include "llvm/Support/Options.h"
 #include "llvm/Support/CommandLine.h"
 #include <map>
@@ -1,7 +1,9 @@
 add_subdirectory(merge-fdata)
+add_subdirectory(Passes)

 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BOLTPasses
   CodeGen
   Core
   DebugInfoDWARF
@@ -18,13 +20,10 @@ add_llvm_tool(llvm-bolt
   BinaryBasicBlock.cpp
   BinaryContext.cpp
   BinaryFunction.cpp
-  BinaryPasses.cpp
   BinaryPassManager.cpp
   DataReader.cpp
   DebugData.cpp
   Exceptions.cpp
-  FrameOptimizerPass.cpp
   RewriteInstance.cpp
-  ReorderAlgorithm.cpp
   DWARFRewriter.cpp
   )
File diff suppressed because it is too large.
@@ -0,0 +1,490 @@
//===--- BinaryPasses.h - Binary-level analysis/optimization passes -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_PASSES_H

#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>

namespace llvm {
namespace bolt {

/// An optimization/analysis pass that runs on functions.
class BinaryFunctionPass {
  const cl::opt<bool> &PrintPass;
protected:
  explicit BinaryFunctionPass(const cl::opt<bool> &PrintPass)
    : PrintPass(PrintPass) { }

  /// Control whether a specific function should be skipped during
  /// optimization.
  bool shouldOptimize(const BinaryFunction &BF) const;
public:
  virtual ~BinaryFunctionPass() = default;

  /// The name of this pass.
  virtual const char *getName() const = 0;

  /// Control whether debug info is printed after this pass is completed.
  bool printPass() const { return PrintPass; }

  /// Control whether debug info is printed for an individual function after
  /// this pass is completed (printPass() must have returned true).
  virtual bool shouldPrint(const BinaryFunction &BF) const;

  /// Execute this pass on the given functions.
  virtual void runOnFunctions(BinaryContext &BC,
                              std::map<uint64_t, BinaryFunction> &BFs,
                              std::set<uint64_t> &LargeFunctions) = 0;
};

/// Detects functions that simply do a tail call when they are called and
/// optimizes calls to these functions.
class OptimizeBodylessFunctions : public BinaryFunctionPass {
private:
  /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G,
  /// thus calls to F can be optimized to calls to G.
  std::unordered_map<const MCSymbol *, const BinaryFunction *>
    EquivalentCallTarget;

  void analyze(BinaryFunction &BF,
               BinaryContext &BC,
               std::map<uint64_t, BinaryFunction> &BFs);

  void optimizeCalls(BinaryFunction &BF,
                     BinaryContext &BC);

  /// Stats for eliminated calls.
  uint64_t NumEliminatedCalls{0};
  uint64_t NumOptimizedCallSites{0};

public:
  explicit OptimizeBodylessFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }
  const char *getName() const override {
    return "optimize-bodyless";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Detect and eliminate unreachable basic blocks. Such blocks may be filled
/// with nops and used for alignment.
class EliminateUnreachableBlocks : public BinaryFunctionPass {
  std::unordered_set<const BinaryFunction *> Modified;
  unsigned DeletedBlocks{0};
  uint64_t DeletedBytes{0};
  void runOnFunction(BinaryFunction& Function);
public:
  EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "eliminate-unreachable";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext&,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

// Reorder the basic blocks for each function based on hotness.
class ReorderBasicBlocks : public BinaryFunctionPass {
public:
  explicit ReorderBasicBlocks(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "reordering";
  }
  bool shouldPrint(const BinaryFunction &BF) const override;
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Sync local branches with CFG.
class FixupBranches : public BinaryFunctionPass {
public:
  explicit FixupBranches(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "fix-branches";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Fix the CFI state and exception handling information after all other
/// passes have completed.
class FixupFunctions : public BinaryFunctionPass {
public:
  explicit FixupFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "fixup-functions";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization to simplify conditional tail calls by removing
/// unnecessary branches.
///
/// This optimization considers both of the following cases:
///
/// foo: ...
///      jcc L1   # original
///      ...
/// L1:  jmp bar  # TAILJMP
///
/// ->
///
/// foo: ...
///      jcc bar  # iff jcc L1 is expected
///      ...
///
/// L1 is unreachable
///
/// OR
///
/// foo: ...
///      jcc  L2
/// L1:  jmp  dest  # TAILJMP
/// L2:  ...
///
/// ->
///
/// foo: jncc dest  # TAILJMP
/// L2:  ...
///
/// L1 is unreachable
///
/// For this particular case, the first basic block ends with
/// a conditional branch and has two successors, one fall-through
/// and one for when the condition is true.
/// The target of the conditional is a basic block with a single
/// unconditional branch (i.e. tail call) to another function.
/// We don't care about the contents of the fall-through block.
/// We assume that the target of the conditional branch is the
/// first successor.
class SimplifyConditionalTailCalls : public BinaryFunctionPass {
  uint64_t NumCandidateTailCalls{0};
  uint64_t NumTailCallsPatched{0};
  uint64_t NumOrigForwardBranches{0};
  uint64_t NumOrigBackwardBranches{0};
  std::unordered_set<const BinaryFunction *> Modified;

  bool shouldRewriteBranch(const BinaryBasicBlock *PredBB,
                           const MCInst &CondBranch,
                           const BinaryBasicBlock *BB,
                           const bool DirectionFlag);

  uint64_t fixTailCalls(BinaryContext &BC, BinaryFunction &BF);
public:
  explicit SimplifyConditionalTailCalls(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "simplify-conditional-tail-calls";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Perform simple peephole optimizations.
class Peepholes : public BinaryFunctionPass {
  uint64_t NumDoubleJumps{0};
  uint64_t TailCallTraps{0};

  /// Attempt to use the minimum operand width for arithmetic, branch and
  /// move instructions.
  void shortenInstructions(BinaryContext &BC, BinaryFunction &Function);

  /// Replace double jumps with a jump directly to the target, i.e.
  /// jmp/jcc L1; L1: jmp L2 -> jmp/jcc L2.
  void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function);

  /// Add trap instructions immediately after indirect tail calls to prevent
  /// the processor from decoding instructions immediately following the
  /// tailcall.
  void addTailcallTraps(BinaryContext &BC, BinaryFunction &Function);
public:
  explicit Peepholes(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "peepholes";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization to simplify loads from read-only sections. The pass
/// converts load instructions with statically computed target address such
/// as:
///
///      mov 0x12f(%rip), %eax
///
/// to their counterparts that use immediate operands instead of memory loads:
///
///     mov $0x4007dc, %eax
///
/// when the target address points somewhere inside a read-only section.
///
class SimplifyRODataLoads : public BinaryFunctionPass {
  uint64_t NumLoadsSimplified{0};
  uint64_t NumDynamicLoadsSimplified{0};
  uint64_t NumLoadsFound{0};
  uint64_t NumDynamicLoadsFound{0};
  std::unordered_set<const BinaryFunction *> Modified;

  bool simplifyRODataLoads(BinaryContext &BC, BinaryFunction &BF);

public:
  explicit SimplifyRODataLoads(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "simplify-read-only-loads";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// An optimization that replaces references to identical functions with
/// references to a single one of them.
///
class IdenticalCodeFolding : public BinaryFunctionPass {
public:
  explicit IdenticalCodeFolding(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "identical-code-folding";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

///
/// Prints a list of the top 100 functions sorted by a set of
/// dyno stats categories.
///
class PrintSortedBy : public BinaryFunctionPass {
public:
  explicit PrintSortedBy(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "print-sorted-by";
  }
  bool shouldPrint(const BinaryFunction &) const override {
    return false;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Optimize indirect calls.
/// The indirect call promotion pass visits each indirect call and
/// examines the BranchData for each. If the most frequent targets
/// from that callsite exceed the specified threshold (default 90%),
/// the call is promoted. Otherwise, it is ignored. By default,
/// only one target is considered at each callsite.
///
/// When a candidate callsite is processed, we modify the callsite
/// to test for the most common call targets before calling through
/// the original generic call mechanism.
///
/// The CFG and layout are modified by ICP.
///
/// A few new command line options have been added:
///   -indirect-call-promotion
///   -indirect-call-promotion-threshold=<percentage>
///   -indirect-call-promotion-mispredict-threshold=<percentage>
///   -indirect-call-promotion-topn=<int>
///
/// The threshold is the minimum frequency of a call target needed
/// before ICP is triggered.
///
/// The mispredict threshold is used to disable the optimization at
/// any callsite where the branch predictor does a good enough job
/// that ICP wouldn't help regardless of the frequency of the most
/// common target.
///
/// The topn option controls the number of targets to consider for
/// each callsite, e.g. ICP is triggered if topn=2 and the total
/// frequency of the top two call targets exceeds the threshold.
///
/// The minimize code size option controls whether or not the hot
/// calls are to registers (callq %r10) or to function addresses
/// (callq $foo).
///
/// Example of ICP:
///
/// C++ code:
///
///   int B_count = 0;
///   int C_count = 0;
///
///   struct A { virtual void foo() = 0; }
///   struct B : public A { virtual void foo() { ++B_count; }; };
///   struct C : public A { virtual void foo() { ++C_count; }; };
///
///   A* a = ...
///   a->foo();
///   ...
///
/// original assembly:
///
///   B0: 49 8b 07             mov    (%r15),%rax
///       4c 89 ff             mov    %r15,%rdi
///       ff 10                callq  *(%rax)
///       41 83 e6 01          and    $0x1,%r14d
///       4d 89 e6             mov    %r12,%r14
///       4c 0f 44 f5          cmove  %rbp,%r14
///       4c 89 f7             mov    %r14,%rdi
///       ...
///
/// after ICP:
///
///   B0: 49 8b 07             mov    (%r15),%rax
///       4c 89 ff             mov    %r15,%rdi
///       48 81 38 e0 0b 40 00 cmpq   $B::foo,(%rax)
///       75 29                jne    B3
///   B1: e8 45 03 00 00       callq  $B::foo
///   B2: 41 83 e6 01          and    $0x1,%r14d
///       4d 89 e6             mov    %r12,%r14
///       4c 0f 44 f5          cmove  %rbp,%r14
///       4c 89 f7             mov    %r14,%rdi
///       ...
///
///   B3: ff 10                callq  *(%rax)
///       eb d6                jmp    B2
///
class IndirectCallPromotion : public BinaryFunctionPass {
  using BasicBlocksVector = std::vector<std::unique_ptr<BinaryBasicBlock>>;
  std::unordered_set<const BinaryFunction *> Modified;
  // Total number of calls from all callsites.
  uint64_t TotalCalls{0};

  // Total number of indirect calls from all callsites.
  // (a fraction of TotalCalls)
  uint64_t TotalIndirectCalls{0};

  // Total number of callsites that use indirect calls.
  // (the total number of callsites is not recorded)
  uint64_t TotalIndirectCallsites{0};

  // Total number of indirect callsites that are optimized by ICP.
  // (a fraction of TotalIndirectCallsites)
  uint64_t TotalOptimizedIndirectCallsites{0};

  // Total number of indirect calls that are optimized by ICP.
  // (a fraction of TotalCalls)
  uint64_t TotalNumFrequentCalls{0};

  std::vector<BranchInfo> getCallTargets(BinaryContext &BC,
                                         const FuncBranchData &BranchData,
                                         const MCInst &Inst) const;

  size_t canPromoteCallsite(const BinaryBasicBlock *BB,
                            const MCInst &Inst,
                            const std::vector<BranchInfo> &Targets,
                            uint64_t NumCalls);

  void printCallsiteInfo(const BinaryBasicBlock *BB,
                         const MCInst &Inst,
                         const std::vector<BranchInfo> &Targets,
                         const size_t N,
                         uint64_t NumCalls) const;

  std::vector<std::pair<MCSymbol *, uint64_t>>
  findCallTargetSymbols(BinaryContext &BC,
                        const std::vector<BranchInfo> &Targets,
                        const size_t N) const;

  std::vector<std::unique_ptr<BinaryBasicBlock>>
  rewriteCall(BinaryContext &BC,
              BinaryFunction &Function,
              BinaryBasicBlock *IndCallBlock,
              const MCInst &CallInst,
              MCInstrAnalysis::ICPdata &&ICPcode) const;

  BinaryBasicBlock *fixCFG(BinaryContext &BC,
                           BinaryFunction &Function,
                           BinaryBasicBlock *IndCallBlock,
                           const bool IsTailCall,
                           BasicBlocksVector &&NewBBs,
                           const std::vector<BranchInfo> &Targets) const;

public:
  explicit IndirectCallPromotion(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "indirect-call-promotion";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Pass for lowering any instructions that we have raised and that have
/// to be lowered.
class InstructionLowering : public BinaryFunctionPass {
public:
  explicit InstructionLowering(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) {}

  const char *getName() const override {
    return "inst-lowering";
  }

  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

#endif
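As an aside, a minimal client of the interface declared above could look like the sketch below. Only the constructor, getName(), and runOnFunctions() signatures are taken from BinaryPasses.h; the class name and body are hypothetical:

    // Hypothetical example, not part of this commit.
    class PrintFunctionCount : public BinaryFunctionPass {
    public:
      explicit PrintFunctionCount(const cl::opt<bool> &PrintPass)
        : BinaryFunctionPass(PrintPass) { }

      const char *getName() const override {
        return "print-function-count";
      }
      void runOnFunctions(BinaryContext &BC,
                          std::map<uint64_t, BinaryFunction> &BFs,
                          std::set<uint64_t> &LargeFunctions) override {
        // Count basic blocks across all functions; BinaryFunction::size()
        // returns the number of basic blocks (see its use in Inliner.cpp
        // below, where size() != 1 filters out multi-block functions).
        uint64_t NumBlocks = 0;
        for (auto &It : BFs)
          NumBlocks += It.second.size();
        errs() << "BOLT-INFO: " << BFs.size() << " functions, "
               << NumBlocks << " basic blocks\n";
      }
    };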
@@ -0,0 +1,8 @@
add_llvm_library(LLVMBOLTPasses
  BinaryPasses.cpp
  FrameOptimizer.cpp
  Inliner.cpp
  ReorderAlgorithm.cpp
  )

include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt )
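Note: by LLVM's usual component-naming convention, add_llvm_library(LLVMBOLTPasses ...) here is presumably what the BOLTPasses entry added to LLVM_LINK_COMPONENTS in the top-level CMakeLists.txt hunk above resolves to (the LLVM prefix being implied by the component system).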
@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.cpp -------------------------------------------===//
+//===--- Passes/FrameOptimizer.cpp ----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //

@@ -9,7 +9,7 @@
 //
 //===----------------------------------------------------------------------===//

-#include "FrameOptimizerPass.h"
+#include "FrameOptimizer.h"
 #include <queue>
 #include <unordered_map>
@@ -1,4 +1,4 @@
-//===--- FrameOptimizerPass.h ---------------------------------------------===//
+//===--- Passes/FrameOptimizer.h ------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //

@@ -9,8 +9,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef FRAMEOPTIMIZERPASS_H
-#define FRAMEOPTIMIZERPASS_H
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H

 #include "BinaryPasses.h"
@@ -0,0 +1,609 @@
//===--- Passes/Inliner.cpp - Inlining infra for BOLT ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Inliner.h"
#include "llvm/Support/Options.h"

#define DEBUG_TYPE "bolt-inliner"

using namespace llvm;

namespace opts {

static cl::list<std::string>
ForceInlineFunctions("force-inline",
                     cl::CommaSeparated,
                     cl::desc("list of functions to always consider "
                              "for inlining"),
                     cl::value_desc("func1,func2,func3,..."),
                     cl::Hidden);

static cl::opt<bool>
AggressiveInlining("aggressive-inlining",
                   cl::desc("perform aggressive inlining"),
                   cl::ZeroOrMore,
                   cl::Hidden);

}

namespace llvm {
namespace bolt {

void InlineSmallFunctions::findInliningCandidates(
    BinaryContext &BC,
    const std::map<uint64_t, BinaryFunction> &BFs) {
  for (const auto &BFIt : BFs) {
    const auto &Function = BFIt.second;
    if (!shouldOptimize(Function) || Function.size() != 1)
      continue;
    auto &BB = *Function.begin();
    const auto &LastInstruction = *BB.rbegin();
    // Check if the function is small enough, doesn't do a tail call
    // and doesn't throw exceptions.
    if (BB.size() > 0 &&
        BB.getNumNonPseudos() <= kMaxInstructions &&
        BB.lp_empty() &&
        BC.MIA->isReturn(LastInstruction) &&
        !BC.MIA->isTailCall(LastInstruction)) {
      InliningCandidates.insert(&Function);
    }
  }

  DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
               << " inlineable functions.\n");
}

void InlineSmallFunctions::findInliningCandidatesAggressive(
    BinaryContext &BC,
    const std::map<uint64_t, BinaryFunction> &BFs) {
  std::set<std::string> OverwrittenFunctions = {
    "_ZN4HPHP13hash_string_iEPKcj",
    "_ZN4HPHP21hash_string_cs_unsafeEPKcj",
    "_ZN4HPHP14hash_string_csEPKcj",
    "_ZN4HPHP20hash_string_i_unsafeEPKcj",
    "_ZNK4HPHP10StringData10hashHelperEv"
  };
  for (const auto &BFIt : BFs) {
    const auto &Function = BFIt.second;
    if (!shouldOptimize(Function) ||
        OverwrittenFunctions.count(Function.getSymbol()->getName()) ||
        Function.hasEHRanges())
      continue;
    uint64_t FunctionSize = 0;
    for (const auto *BB : Function.layout()) {
      FunctionSize += BC.computeCodeSize(BB->begin(), BB->end());
    }
    assert(FunctionSize > 0 && "found empty function");
    if (FunctionSize > kMaxSize)
      continue;
    bool FoundCFI = false;
    for (const auto BB : Function.layout()) {
      for (const auto &Inst : *BB) {
        if (BC.MIA->isEHLabel(Inst) || BC.MIA->isCFI(Inst)) {
          FoundCFI = true;
          break;
        }
      }
    }
    if (!FoundCFI)
      InliningCandidates.insert(&Function);
  }

  DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size()
               << " inlineable functions.\n");
}

namespace {

/// Returns whether a function creates a stack frame for itself or not.
/// If so, we need to manipulate the stack pointer when calling this function.
/// Since we're only inlining very small functions, we return false for now, but
/// we could for instance check if the function starts with 'push ebp'.
/// TODO: generalize this.
bool createsStackFrame(const BinaryBasicBlock &) {
  return false;
}

} // namespace

void InlineSmallFunctions::inlineCall(
    BinaryContext &BC,
    BinaryBasicBlock &BB,
    MCInst *CallInst,
    const BinaryBasicBlock &InlinedFunctionBB) {
  assert(BC.MIA->isCall(*CallInst) && "Can only inline a call.");
  assert(BC.MIA->isReturn(*InlinedFunctionBB.rbegin()) &&
         "Inlined function should end with a return.");

  std::vector<MCInst> InlinedInstance;

  bool ShouldAdjustStack = createsStackFrame(InlinedFunctionBB);

  // Move stack like 'call' would if needed.
  if (ShouldAdjustStack) {
    MCInst StackInc;
    BC.MIA->createStackPointerIncrement(StackInc);
    InlinedInstance.push_back(StackInc);
  }

  for (auto Instruction : InlinedFunctionBB) {
    if (BC.MIA->isReturn(Instruction)) {
      break;
    }
    if (!BC.MIA->isEHLabel(Instruction) &&
        !BC.MIA->isCFI(Instruction)) {
      InlinedInstance.push_back(Instruction);
    }
  }

  // Move stack pointer like 'ret' would.
  if (ShouldAdjustStack) {
    MCInst StackDec;
    BC.MIA->createStackPointerDecrement(StackDec);
    InlinedInstance.push_back(StackDec);
  }

  BB.replaceInstruction(CallInst, InlinedInstance);
}

std::pair<BinaryBasicBlock *, unsigned>
InlineSmallFunctions::inlineCall(
    BinaryContext &BC,
    BinaryFunction &CallerFunction,
    BinaryBasicBlock *CallerBB,
    const unsigned CallInstIndex,
    const BinaryFunction &InlinedFunction) {
  // Get the instruction to be replaced with inlined code.
  MCInst &CallInst = CallerBB->getInstructionAtIndex(CallInstIndex);
  assert(BC.MIA->isCall(CallInst) && "Can only inline a call.");

  // Point in the function after the inlined code.
  BinaryBasicBlock *AfterInlinedBB = nullptr;
  unsigned AfterInlinedIstrIndex = 0;

  // In case of a tail call we should not remove any ret instructions from the
  // inlined instance.
  bool IsTailCall = BC.MIA->isTailCall(CallInst);

  // The first block of the function to be inlined can be merged with the caller
  // basic block. This cannot happen if there are jumps to the first block.
  bool CanMergeFirstInlinedBlock = (*InlinedFunction.begin()).pred_size() == 0;

  // If the call to be inlined is not at the end of its basic block and we have
  // to inline more than one basic block (or even just one basic block that
  // cannot be merged into the caller block), then the caller's basic block
  // should be split.
  bool ShouldSplitCallerBB =
    CallInstIndex < CallerBB->size() - 1 &&
    (InlinedFunction.size() > 1 || !CanMergeFirstInlinedBlock);

  // Copy the inlined function's basic blocks into a vector of basic blocks
  // that will be inserted in the caller function (the inlined instance). Also,
  // we keep a mapping from basic block index to the corresponding block in the
  // inlined instance.
  std::vector<std::unique_ptr<BinaryBasicBlock>> InlinedInstance;
  std::unordered_map<const BinaryBasicBlock *, BinaryBasicBlock *> InlinedBBMap;

  for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
    InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
    InlinedBBMap[InlinedFunctionBB] = InlinedInstance.back().get();
    if (InlinedFunction.hasValidProfile()) {
      const auto Count = InlinedFunctionBB->getExecutionCount();
      InlinedInstance.back()->setExecutionCount(Count);
    }
  }
  if (ShouldSplitCallerBB) {
    // Add one extra block at the inlined instance for the removed part of the
    // caller block.
    InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0));
    if (CallerFunction.hasValidProfile()) {
      const auto Count = CallerBB->getExecutionCount();
      InlinedInstance.back()->setExecutionCount(Count);
    }
  }

  // Copy instructions to the basic blocks of the inlined instance.
  bool First = true;
  for (const auto InlinedFunctionBB : InlinedFunction.layout()) {
    // Get the corresponding block of the inlined instance.
    auto *InlinedInstanceBB = InlinedBBMap.at(InlinedFunctionBB);
    bool IsExitingBlock = false;

    // Copy instructions into the inlined instance.
    for (auto Instruction : *InlinedFunctionBB) {
      if (!IsTailCall &&
          BC.MIA->isReturn(Instruction) &&
          !BC.MIA->isTailCall(Instruction)) {
        // Skip returns when the caller does a normal call as opposed to a tail
        // call.
        IsExitingBlock = true;
        continue;
      }
      if (!IsTailCall &&
          BC.MIA->isTailCall(Instruction)) {
        // Convert tail calls to normal calls when the caller does a normal
        // call.
        if (!BC.MIA->convertTailCallToCall(Instruction))
          assert(false && "unexpected tail call opcode found");
        IsExitingBlock = true;
      }
      if (BC.MIA->isBranch(Instruction) &&
          !BC.MIA->isIndirectBranch(Instruction)) {
        // Convert the branch targets in the branch instructions that will be
        // added to the inlined instance.
        const MCSymbol *OldTargetLabel = nullptr;
        const MCSymbol *OldFTLabel = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel,
                                                  OldFTLabel, CondBranch,
                                                  UncondBranch);
        assert(Result &&
               "analyzeBranch failed on instruction guaranteed to be a branch");
        assert(OldTargetLabel);
        const MCSymbol *NewTargetLabel = nullptr;
        for (const auto SuccBB : InlinedFunctionBB->successors()) {
          if (SuccBB->getLabel() == OldTargetLabel) {
            NewTargetLabel = InlinedBBMap.at(SuccBB)->getLabel();
            break;
          }
        }
        assert(NewTargetLabel);
        BC.MIA->replaceBranchTarget(Instruction, NewTargetLabel, BC.Ctx.get());
      }
      // TODO: Currently we simply ignore CFI instructions, but we need to
      // address them for correctness.
      if (!BC.MIA->isEHLabel(Instruction) &&
          !BC.MIA->isCFI(Instruction)) {
        InlinedInstanceBB->addInstruction(std::move(Instruction));
      }
    }

    // Add CFG edges to the basic blocks of the inlined instance.
    std::vector<BinaryBasicBlock *>
      Successors(InlinedFunctionBB->succ_size(), nullptr);

    std::transform(
        InlinedFunctionBB->succ_begin(),
        InlinedFunctionBB->succ_end(),
        Successors.begin(),
        [&InlinedBBMap](const BinaryBasicBlock *BB) {
          return InlinedBBMap.at(BB);
        });

    if (InlinedFunction.hasValidProfile()) {
      InlinedInstanceBB->addSuccessors(
          Successors.begin(),
          Successors.end(),
          InlinedFunctionBB->branch_info_begin(),
          InlinedFunctionBB->branch_info_end());
    } else {
      InlinedInstanceBB->addSuccessors(
          Successors.begin(),
          Successors.end());
    }

    if (IsExitingBlock) {
      assert(Successors.size() == 0);
      if (ShouldSplitCallerBB) {
        if (InlinedFunction.hasValidProfile()) {
          InlinedInstanceBB->addSuccessor(
              InlinedInstance.back().get(),
              InlinedInstanceBB->getExecutionCount());
        } else {
          InlinedInstanceBB->addSuccessor(InlinedInstance.back().get());
        }
        InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get());
      } else if (!First || !CanMergeFirstInlinedBlock) {
        assert(CallInstIndex == CallerBB->size() - 1);
        assert(CallerBB->succ_size() <= 1);
        if (CallerBB->succ_size() == 1) {
          if (InlinedFunction.hasValidProfile()) {
            InlinedInstanceBB->addSuccessor(
                *CallerBB->succ_begin(),
                InlinedInstanceBB->getExecutionCount());
          } else {
            InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin());
          }
          InlinedInstanceBB->addBranchInstruction(*CallerBB->succ_begin());
        }
      }
    }

    First = false;
  }

  if (ShouldSplitCallerBB) {
    // Split the basic block that contains the call and add the removed
    // instructions in the last block of the inlined instance.
    // (Is it OK to have a basic block with just CFI instructions?)
    std::vector<MCInst> TrailInstructions =
      CallerBB->splitInstructions(&CallInst);
    assert(TrailInstructions.size() > 0);
    InlinedInstance.back()->addInstructions(
        TrailInstructions.begin(),
        TrailInstructions.end());
    // Add CFG edges for the block with the removed instructions.
    if (CallerFunction.hasValidProfile()) {
      InlinedInstance.back()->addSuccessors(
          CallerBB->succ_begin(),
          CallerBB->succ_end(),
          CallerBB->branch_info_begin(),
          CallerBB->branch_info_end());
    } else {
      InlinedInstance.back()->addSuccessors(
          CallerBB->succ_begin(),
          CallerBB->succ_end());
    }
    // Update the after-inlined point.
    AfterInlinedBB = InlinedInstance.back().get();
    AfterInlinedIstrIndex = 0;
  }

  assert(InlinedInstance.size() > 0 && "found function with no basic blocks");
  assert(InlinedInstance.front()->size() > 0 &&
         "found function with empty basic block");

  // If the inlining cannot happen as a simple instruction insertion into
  // CallerBB, we remove the outgoing CFG edges of the caller block.
  if (InlinedInstance.size() > 1 || !CanMergeFirstInlinedBlock) {
    CallerBB->removeSuccessors(CallerBB->succ_begin(), CallerBB->succ_end());
    if (!ShouldSplitCallerBB) {
      // Update the after-inlined point.
      AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
      AfterInlinedIstrIndex = 0;
    }
  } else {
    assert(!ShouldSplitCallerBB);
    // Update the after-inlined point.
    if (CallInstIndex < CallerBB->size() - 1) {
      AfterInlinedBB = CallerBB;
      AfterInlinedIstrIndex =
        CallInstIndex + InlinedInstance.front()->size();
    } else {
      AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB);
      AfterInlinedIstrIndex = 0;
    }
  }

  // Do the inlining by merging the first block of the inlined instance into
  // the caller basic block if possible and adding the rest of the inlined
  // instance basic blocks in the caller function.
  if (CanMergeFirstInlinedBlock) {
    CallerBB->replaceInstruction(
        &CallInst,
        InlinedInstance.front()->begin(),
        InlinedInstance.front()->end());
    if (InlinedInstance.size() > 1) {
      auto FirstBB = InlinedInstance.begin()->get();
      if (InlinedFunction.hasValidProfile()) {
        CallerBB->addSuccessors(
            FirstBB->succ_begin(),
            FirstBB->succ_end(),
            FirstBB->branch_info_begin(),
            FirstBB->branch_info_end());
      } else {
        CallerBB->addSuccessors(
            FirstBB->succ_begin(),
            FirstBB->succ_end());
      }
      FirstBB->removeSuccessors(FirstBB->succ_begin(), FirstBB->succ_end());
    }
    InlinedInstance.erase(InlinedInstance.begin());
  } else {
    CallerBB->eraseInstruction(&CallInst);
    if (CallerFunction.hasValidProfile()) {
      CallerBB->addSuccessor(InlinedInstance.front().get(),
                             CallerBB->getExecutionCount());
    } else {
      CallerBB->addSuccessor(InlinedInstance.front().get(),
                             CallerBB->getExecutionCount());
    }
  }
  CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance));

  return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex);
}

bool InlineSmallFunctions::inlineCallsInFunction(
    BinaryContext &BC,
    BinaryFunction &Function) {
  std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
                                         Function.layout().end());
  std::sort(Blocks.begin(), Blocks.end(),
            [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
              return BB1->getExecutionCount() > BB2->getExecutionCount();
            });
  uint32_t ExtraSize = 0;

  for (auto BB : Blocks) {
    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst)) {
        TotalDynamicCalls += BB->getExecutionCount();
      }
    }
  }

  bool DidInlining = false;

  for (auto BB : Blocks) {
    if (BB->isCold())
      continue;

    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst) &&
          !BC.MIA->isTailCall(Inst) &&
          Inst.size() == 1 &&
          Inst.getOperand(0).isExpr()) {
        const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        assert(TargetSymbol && "target symbol expected for direct call");
        const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
        if (TargetFunction) {
          bool CallToInlineableFunction =
            InliningCandidates.count(TargetFunction);

          TotalInlineableCalls +=
            CallToInlineableFunction * BB->getExecutionCount();

          if (CallToInlineableFunction &&
              TargetFunction->getSize() + ExtraSize
              + Function.estimateHotSize() < Function.getMaxSize()) {
            auto NextInstIt = std::next(InstIt);
            inlineCall(BC, *BB, &Inst, *TargetFunction->begin());
            DidInlining = true;
            DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
                         << *TargetFunction << " in "
                         << Function << "\n");
            InstIt = NextInstIt;
            ExtraSize += TargetFunction->getSize();
            InlinedDynamicCalls += BB->getExecutionCount();
            continue;
          }
        }
      }

      ++InstIt;
    }
  }

  return DidInlining;
}

bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
    BinaryContext &BC,
    BinaryFunction &Function) {
  std::vector<BinaryBasicBlock *> Blocks(Function.layout().begin(),
                                         Function.layout().end());
  std::sort(Blocks.begin(), Blocks.end(),
            [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) {
              return BB1->getExecutionCount() > BB2->getExecutionCount();
            });
  uint32_t ExtraSize = 0;

  for (auto BB : Blocks) {
    for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst)) {
        TotalDynamicCalls += BB->getExecutionCount();
      }
    }
  }

  bool DidInlining = false;

  for (auto BB : Blocks) {
    if (BB->isCold())
      continue;

    unsigned InstIndex = 0;
    for (auto InstIt = BB->begin(); InstIt != BB->end(); ) {
      auto &Inst = *InstIt;
      if (BC.MIA->isCall(Inst) &&
          Inst.size() == 1 &&
          Inst.getOperand(0).isExpr()) {
        assert(!BC.MIA->isInvoke(Inst));
        const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        assert(TargetSymbol && "target symbol expected for direct call");
        const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
        if (TargetFunction) {
          bool CallToInlineableFunction =
            InliningCandidates.count(TargetFunction);

          TotalInlineableCalls +=
            CallToInlineableFunction * BB->getExecutionCount();

          if (CallToInlineableFunction &&
              TargetFunction->getSize() + ExtraSize
              + Function.estimateHotSize() < Function.getMaxSize()) {
            unsigned NextInstIndex = 0;
            BinaryBasicBlock *NextBB = nullptr;
            std::tie(NextBB, NextInstIndex) =
              inlineCall(BC, Function, BB, InstIndex, *TargetFunction);
            DidInlining = true;
            DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to "
                         << *TargetFunction << " in "
                         << Function << "\n");
            InstIndex = NextBB == BB ? NextInstIndex : BB->size();
            InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end();
            ExtraSize += TargetFunction->getSize();
            InlinedDynamicCalls += BB->getExecutionCount();
            continue;
          }
        }
      }

      ++InstIndex;
      ++InstIt;
    }
  }

  return DidInlining;
}

bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) {
  for (auto &Name : opts::ForceInlineFunctions) {
    if (BF.hasName(Name))
      return true;
  }
  return false;
}

void InlineSmallFunctions::runOnFunctions(
    BinaryContext &BC,
    std::map<uint64_t, BinaryFunction> &BFs,
    std::set<uint64_t> &) {

  if (opts::AggressiveInlining)
    findInliningCandidatesAggressive(BC, BFs);
  else
    findInliningCandidates(BC, BFs);

  std::vector<BinaryFunction *> ConsideredFunctions;
  for (auto &It : BFs) {
    auto &Function = It.second;
    if (!shouldOptimize(Function) ||
        (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE &&
         !mustConsider(Function)))
      continue;
    ConsideredFunctions.push_back(&Function);
  }
  std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(),
            [](BinaryFunction *A, BinaryFunction *B) {
              return B->getExecutionCount() < A->getExecutionCount();
            });
  unsigned ModifiedFunctions = 0;
  for (unsigned i = 0; i < ConsideredFunctions.size() &&
       ModifiedFunctions <= kMaxFunctions; ++i) {
    auto &Function = *ConsideredFunctions[i];

    const bool DidInline = opts::AggressiveInlining
      ? inlineCallsInFunctionAggressive(BC, Function)
      : inlineCallsInFunction(BC, Function);

    if (DidInline) {
      Modified.insert(&Function);
      ++ModifiedFunctions;
    }
  }

  DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of "
               << TotalDynamicCalls << " function calls in the profile.\n"
               << "BOLT-INFO: Inlined calls represent "
               << format("%.1f",
                         100.0 * InlinedDynamicCalls / TotalInlineableCalls)
               << "% of all inlineable calls in the profile.\n");
}

} // namespace bolt
} // namespace llvm
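For reference, the two hidden options registered at the top of this file would be driven from the command line roughly as follows (a hypothetical invocation; the binary and output names are placeholders):

    llvm-bolt a.out -o a.bolt.out -force-inline=func1,func2 -aggressive-inlining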
@@ -0,0 +1,102 @@
//===--- Passes/Inliner.h - Inlining infra for BOLT -----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The set of optimization/analysis passes that run on BinaryFunctions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INLINER_H

#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryPasses.h"

namespace llvm {
namespace bolt {

/// Inlining of single basic block functions.
/// The pass currently does not handle CFI instructions; handling them is
/// needed for correctness, so we may break exception handling because of this.
class InlineSmallFunctions : public BinaryFunctionPass {
private:
  std::set<const BinaryFunction *> InliningCandidates;

  /// Maximum number of instructions in an inlined function.
  static const unsigned kMaxInstructions = 8;
  /// Maximum code size (in bytes) of inlined function (used by aggressive
  /// inlining).
  static const uint64_t kMaxSize = 60;
  /// Maximum number of functions that will be considered for inlining (in
  /// descending hotness order).
  static const unsigned kMaxFunctions = 30000;

  /// Statistics collected for debugging.
  uint64_t TotalDynamicCalls = 0;
  uint64_t InlinedDynamicCalls = 0;
  uint64_t TotalInlineableCalls = 0;
  std::unordered_set<const BinaryFunction *> Modified;

  static bool mustConsider(const BinaryFunction &BF);

  void findInliningCandidates(BinaryContext &BC,
                              const std::map<uint64_t, BinaryFunction> &BFs);

  /// Inline the call in CallInst to InlinedFunctionBB (the only BB of the
  /// called function).
  void inlineCall(BinaryContext &BC,
                  BinaryBasicBlock &BB,
                  MCInst *CallInst,
                  const BinaryBasicBlock &InlinedFunctionBB);

  bool inlineCallsInFunction(BinaryContext &BC,
                             BinaryFunction &Function);

  /// The following methods do a more aggressive inlining pass, where we
  /// inline calls as well as tail calls and we are not limited to inlining
  /// functions with only one basic block.
  /// FIXME: Currently these are broken since they do not work with the split
  /// function option.
  void findInliningCandidatesAggressive(
      BinaryContext &BC, const std::map<uint64_t, BinaryFunction> &BFs);

  bool inlineCallsInFunctionAggressive(
      BinaryContext &BC, BinaryFunction &Function);

  /// Inline the call in CallInst to InlinedFunction. The inlined function
  /// should not contain any landing pads or throwing edges but may have more
  /// than one basic block.
  ///
  /// Return the location (basic block and instruction index) where the code of
  /// the caller function continues after the inlined code.
  std::pair<BinaryBasicBlock *, unsigned>
  inlineCall(BinaryContext &BC,
             BinaryFunction &CallerFunction,
             BinaryBasicBlock *CallerBB,
             const unsigned CallInstIndex,
             const BinaryFunction &InlinedFunction);

public:
  explicit InlineSmallFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "inlining";
  }
  bool shouldPrint(const BinaryFunction &BF) const override {
    return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

#endif
@@ -0,0 +1,698 @@
//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implements different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//

#include "ReorderAlgorithm.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <queue>
#include <functional>

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

static cl::opt<bool>
PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore);

static cl::opt<uint32_t>
RandomSeed("bolt-seed",
           cl::desc("seed for randomization"),
           cl::init(42),
           cl::ZeroOrMore);

} // namespace opts

namespace {

template <class T>
inline void hashCombine(size_t &Seed, const T &Val) {
  std::hash<T> Hasher;
  Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

template <typename A, typename B>
struct HashPair {
  size_t operator()(const std::pair<A,B>& Val) const {
    std::hash<A> Hasher;
    size_t Seed = Hasher(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

}
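// Editor's note: hashCombine() above is the familiar Boost-style
// hash_combine, and HashPair builds on it so that a std::pair can serve as a
// key in unordered containers; GreedyClusterAlgorithm::EdgeHash below uses it
// to hash (Src, Dst) basic block pairs.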
||||
void ClusterAlgorithm::computeClusterAverageFrequency() {
|
||||
AvgFreq.resize(Clusters.size(), 0.0);
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
double Freq = 0.0;
|
||||
for (auto BB : Clusters[I]) {
|
||||
if (BB->getNumNonPseudos() > 0)
|
||||
Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos();
|
||||
}
|
||||
AvgFreq[I] = Freq;
|
||||
}
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::printClusters() const {
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
errs() << "Cluster number " << I;
|
||||
if (AvgFreq.size() == Clusters.size())
|
||||
errs() << " (frequency: " << AvgFreq[I] << ")";
|
||||
errs() << " : ";
|
||||
auto Sep = "";
|
||||
for (auto BB : Clusters[I]) {
|
||||
errs() << Sep << BB->getName();
|
||||
Sep = ", ";
|
||||
}
|
||||
errs() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::reset() {
|
||||
Clusters.clear();
|
||||
ClusterEdges.clear();
|
||||
AvgFreq.clear();
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
|
||||
OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
|
||||
}
|
||||
|
||||
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
|
||||
HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
|
||||
return Hasher(std::make_pair(E.Src, E.Dst));
|
||||
}
|
||||
|
||||
bool GreedyClusterAlgorithm::EdgeEqual::operator()(
|
||||
const EdgeTy &A, const EdgeTy &B) const {
|
||||
return A.Src == B.Src && A.Dst == B.Dst;
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
|
||||
bool ComputeEdges) {
|
||||
reset();
|
||||
|
||||
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
|
||||
// maximize weight during a path traversing all BBs. In this way, we will
|
||||
// convert the hottest branches into fall-throughs.
|
||||
|
||||
// This is the queue of edges from which we will pop edges and use them to
|
||||
// cluster basic blocks in a greedy fashion.
|
||||
std::vector<EdgeTy> Queue;
|
||||
|
||||
// Initialize inter-cluster weights.
|
||||
if (ComputeEdges)
|
||||
ClusterEdges.resize(BF.layout_size());
|
||||
|
||||
// Initialize clusters and edge queue.
|
||||
for (auto BB : BF.layout()) {
|
||||
// Create a cluster for this BB.
|
||||
uint32_t I = Clusters.size();
|
||||
Clusters.emplace_back();
|
||||
auto &Cluster = Clusters.back();
|
||||
Cluster.push_back(BB);
|
||||
BBToClusterMap[BB] = I;
|
||||
// Populate priority queue with edges.
|
||||
auto BI = BB->branch_info_begin();
|
||||
for (auto &I : BB->successors()) {
|
||||
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
|
||||
"attempted reordering blocks of function with no profile data");
|
||||
Queue.emplace_back(EdgeTy(BB, I, BI->Count));
|
||||
++BI;
|
||||
}
|
||||
}
|
||||
// Sort and adjust the edge queue.
|
||||
initQueue(Queue, BF);
|
||||
|
||||
// Grow clusters in a greedy fashion.
|
||||
while (!Queue.empty()) {
|
||||
auto E = Queue.back();
|
||||
Queue.pop_back();
|
||||
|
||||
const auto *SrcBB = E.Src;
|
||||
const auto *DstBB = E.Dst;
|
||||
|
||||
DEBUG(dbgs() << "Popped edge ";
|
||||
E.print(dbgs());
|
||||
dbgs() << "\n");
|
||||
|
||||
// Case 1: BBSrc and BBDst are the same. Ignore this edge
|
||||
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
|
||||
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
int I = BBToClusterMap[SrcBB];
|
||||
int J = BBToClusterMap[DstBB];
|
||||
|
||||
// Case 2: If they are already allocated at the same cluster, just increase
|
||||
// the weight of this cluster
|
||||
if (I == J) {
|
||||
if (ComputeEdges)
|
||||
ClusterEdges[I][I] += E.Count;
|
||||
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
auto &ClusterA = Clusters[I];
|
||||
auto &ClusterB = Clusters[J];
|
||||
if (areClustersCompatible(ClusterA, ClusterB, E)) {
|
||||
// Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
|
||||
// allowing us to merge two clusters.
|
||||
for (auto BB : ClusterB)
|
||||
BBToClusterMap[BB] = I;
|
||||
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
|
||||
ClusterB.clear();
|
||||
if (ComputeEdges) {
|
||||
// Increase the intra-cluster edge count of cluster A with the count of
|
||||
// this edge as well as with the total count of previously visited edges
|
||||
// from cluster B cluster A.
|
||||
ClusterEdges[I][I] += E.Count;
|
||||
ClusterEdges[I][I] += ClusterEdges[J][I];
|
||||
// Iterate through all inter-cluster edges and transfer edges targeting
|
||||
// cluster B to cluster A.
|
||||
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
|
||||
ClusterEdges[K][I] += ClusterEdges[K][J];
|
||||
}
|
||||
// Adjust the weights of the remaining edges and re-sort the queue.
|
||||
adjustQueue(Queue, BF);
|
||||
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
|
||||
} else {
|
||||
// Case 4: Both SrcBB and DstBB are allocated in positions we cannot
|
||||
// merge them. Add the count of this edge to the inter-cluster edge count
|
||||
// between clusters A and B to help us decide ordering between these
|
||||
// clusters.
|
||||
if (ComputeEdges)
|
||||
ClusterEdges[I][J] += E.Count;
|
||||
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GreedyClusterAlgorithm::reset() {
|
||||
ClusterAlgorithm::reset();
|
||||
BBToClusterMap.clear();
|
||||
}
|
||||
|
||||
void PHGreedyClusterAlgorithm::initQueue(
|
||||
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
|
||||
// Define a comparison function to establish SWO between edges.
|
||||
auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) {
|
||||
// With equal weights, prioritize branches with lower index
|
||||
// source/destination. This helps to keep original block order for blocks
|
||||
// when optimal order cannot be deducted from a profile.
|
||||
if (A.Count == B.Count) {
|
||||
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
|
||||
return (SrcOrder != 0)
|
||||
? SrcOrder > 0
|
||||
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
|
||||
}
|
||||
return A.Count < B.Count;
|
||||
};
|
||||
|
||||
// Sort edges in increasing profile count order.
|
||||
std::sort(Queue.begin(), Queue.end(), Comp);
|
||||
}

void PHGreedyClusterAlgorithm::adjustQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Nothing to do.
  return;
}

bool PHGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with the same source and destination, edges that target
    // the entry block, as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with the same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}
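
// Illustrative sketch (not part of this pass): the arithmetic above in
// isolation. For an edge A->B with count 500, another successor edge A->C
// with count 100, and another predecessor edge D->B with count 150, the
// weight is 500 - 100 - 150 = 250 branches saved if A->B becomes a
// fall-through. All names here are hypothetical.
namespace {
int64_t fallThroughWin(int64_t EdgeCount, int64_t OtherSrcSuccCounts,
                       int64_t OtherDstPredCounts) {
  // Laying the edge out as a fall-through saves EdgeCount branches, but every
  // other edge leaving the source or entering the destination must remain an
  // explicit branch.
  return EdgeCount - OtherSrcSuccCounts - OtherDstPredCounts;
}
} // anonymous namespace
// e.g. fallThroughWin(500, 100, 150) == 250, matching the example in
// Passes/ReorderAlgorithm.h.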

void MinBranchGreedyClusterAlgorithm::initQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Initialize edge weights.
  for (const EdgeTy &E : Queue)
    Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

  // Sort edges in increasing weight order.
  adjustQueue(Queue, BF);
}

void MinBranchGreedyClusterAlgorithm::adjustQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Define a comparison function to establish a strict weak ordering (SWO)
  // between edges.
  auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep the original block order for
    // blocks when the optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
        ? SrcOrder > 0
        : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const auto *SrcBB = E.Src;
    const auto *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block.
    // Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
            E.print(dbgs());
            dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    auto &ClusterA = Clusters[I];
    auto &ClusterB = Clusters[J];

    // Case 2: SrcBB and DstBB are already allocated to the same cluster or to
    // incompatible clusters. Adjust the weights of edges with the same source
    // or destination, so that this edge no longer has any effect on them, and
    // ignore this edge. Also increase the intra- (or inter-) cluster edge
    // count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
            E.print(dbgs());
            dbgs() << " (src, dst belong to same cluster or incompatible "
                      "clusters)\n");
      for (const auto *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const auto *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  std::sort(Queue.begin(), Queue.end(), Comp);
}

bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

void MinBranchGreedyClusterAlgorithm::reset() {
  GreedyClusterAlgorithm::reset();
  Weight.clear();
}

void OptimalReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::unordered_map<const BinaryBasicBlock *, int> BBToIndex;
  std::vector<BinaryBasicBlock *> IndexToBB;

  unsigned N = BF.layout_size();
  // Populate the weight map and the index maps.
  for (auto BB : BF.layout()) {
    BBToIndex[BB] = IndexToBB.size();
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (auto BB : BF.layout()) {
    auto BI = BB->branch_info_begin();
    Weight[BBToIndex[BB]].resize(N);
    for (auto I : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
      ++BI;
    }
  }

  std::vector<std::vector<int64_t>> DP;
  DP.resize(1 << N);
  for (auto &Elmt : DP) {
    Elmt.resize(N, -1);
  }
  // Start with the entry basic block being allocated with cost zero.
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (the
  // current set of BBs in the layout).
  unsigned BestSet = 1;
  unsigned BestLast = 0;
  int64_t BestWeight = 0;
  for (unsigned Set = 1; Set < (1U << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout.
    for (unsigned Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last.
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New.
      for (unsigned New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set.
        if ((Set & (1 << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this set, so we add it to this Set and
        // record the total weight of this layout with "New" as the last BB.
        unsigned NewSet = (Set | (1 << New));
        if (DP[NewSet][New] == -1)
          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Define the final function layout based on the layout that maximizes
  // weight.
  unsigned Last = BestLast;
  unsigned Set = BestSet;
  std::vector<bool> Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1U << Last);
  while (Set != 0) {
    int64_t Best = -1;
    for (unsigned I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      if (DP[Set][I] > Best) {
        Last = I;
        Best = DP[Set][I];
      }
    }
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1U << Last);
  }
  std::reverse(Order.begin(), Order.end());

  // Finalize the layout with BBs that weren't assigned to it.
  for (auto BB : BF.layout()) {
    if (!Visited[BBToIndex[BB]])
      Order.push_back(BB);
  }
}
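
// Note (sketch, not BOLT API): the DP above fills 2^N * N table cells and
// scans N candidates per cell, i.e. O(2^N * N^2) time and O(2^N * N) memory,
// so it is only practical for functions with a handful of blocks. A
// caller-side guard might look like this; the threshold is illustrative:
namespace {
bool fitsOptimalReorder(unsigned NumBlocks) {
  return NumBlocks <= 10; // 2^10 * 10 int64_t DP cells is roughly 80 KB
}
} // anonymous namespace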

void OptimizeReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Arrange basic blocks according to clusters.
  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}

void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  auto &ClusterEdges = CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency();
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight.
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)> SuccQueue(ClusterComp);
      for (auto &Target: ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node.
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence.
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point.
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
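
// Illustrative sketch (not part of this pass): the traversal above reduced to
// a plain adjacency map. Successors are pushed coldest-first so the hottest
// one sits on top of the stack and is visited first; nodes are emitted in
// reverse finish order, giving a topological-style order biased toward hot
// paths. Node 0 is assumed to be the entry; all names are hypothetical.
namespace {
std::vector<unsigned>
hotTopoOrder(const std::vector<std::map<unsigned, uint64_t>> &Succ) {
  std::vector<unsigned> Order;
  std::vector<char> Stacked(Succ.size(), 0), Visited(Succ.size(), 0);
  std::stack<unsigned> Stack;
  Stack.push(0);
  Stacked[0] = 1;
  while (!Stack.empty()) {
    unsigned I = Stack.top();
    if (!Visited[I]) {
      Visited[I] = 1;
      // Sort successors by ascending weight, then push in that order so the
      // hottest successor ends up on top of the stack.
      std::vector<std::pair<unsigned, uint64_t>> S(Succ[I].begin(),
                                                   Succ[I].end());
      std::sort(S.begin(), S.end(),
                [](const std::pair<unsigned, uint64_t> &A,
                   const std::pair<unsigned, uint64_t> &B) {
                  return A.second < B.second;
                });
      for (const auto &T : S) {
        if (T.second > 0 && !Stacked[T.first]) {
          Stacked[T.first] = 1;
          Stack.push(T.first);
        }
      }
      continue;
    }
    Stack.pop();
    Order.push_back(I);
  }
  std::reverse(Order.begin(), Order.end());
  return Order;
}
} // anonymous namespace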

void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency();
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point.
  std::stable_sort(std::next(ClusterOrder.begin()),
                   ClusterOrder.end(),
                   [&AvgFreq](uint32_t A, uint32_t B) {
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}

void ReverseReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  auto FirstBB = *BF.layout_begin();
  Order.push_back(FirstBB);
  for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
    Order.push_back(*RLI);
}


void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order.
  std::vector<uint32_t> ClusterOrder;

  // Collect the non-empty clusters.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  std::srand(opts::RandomSeed);
  std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end());

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
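
// Note (sketch, not part of the commit): std::random_shuffle, used above, was
// deprecated in C++14 and removed in C++17. An equivalent helper with an
// explicit, seedable engine:
namespace {
template <typename RandomIt>
void seededShuffle(RandomIt First, RandomIt Last, unsigned Seed) {
  std::mt19937 Gen(Seed);         // deterministic for a fixed seed
  std::shuffle(First, Last, Gen); // unbiased Fisher-Yates shuffle
}
} // anonymous namespace
// Usage would mirror the call above (requires <random>), e.g.
//   seededShuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
//                 opts::RandomSeed);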
@ -0,0 +1,268 @@
//===--- Passes/ReorderAlgorithm.h - Basic block reordering algorithms ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Interface to different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H

#include "BinaryFunction.h"
#include "llvm/Support/ErrorHandling.h"
#include <memory>
#include <unordered_map>
#include <vector>


namespace llvm {

class raw_ostream;


namespace bolt {

class BinaryBasicBlock;
class BinaryFunction;

/// Objects of this class implement various basic block clustering algorithms.
/// Basic block clusters are chains of basic blocks that should be laid out
/// in this order to maximize performance. These algorithms group basic blocks
/// into clusters using execution profile data and various heuristics.
class ClusterAlgorithm {
public:
  using ClusterTy = std::vector<BinaryBasicBlock *>;
  std::vector<ClusterTy> Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> ClusterEdges;
  std::vector<double> AvgFreq;

  /// Group the basic blocks in the given function into clusters stored in the
  /// Clusters vector. Also encode relative weights between two clusters in
  /// the ClusterEdges vector if requested. This vector is indexed by
  /// the cluster indices in the Clusters vector.
  virtual void clusterBasicBlocks(const BinaryFunction &BF,
                                  bool ComputeEdges = false) = 0;

  /// Compute for each cluster its average execution frequency, that is
  /// the sum of the average frequencies of its blocks (execution count /
  /// # instrs). The average frequencies are stored in the AvgFreq vector,
  /// indexed by the cluster indices in the Clusters vector.
  void computeClusterAverageFrequency();

  /// Clear clusters and related info.
  virtual void reset();

  void printClusters() const;

  virtual ~ClusterAlgorithm() {}
};
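
/// Illustrative sketch (not the BOLT implementation): the formula documented
/// for computeClusterAverageFrequency() on plain numbers. Each block
/// contributes its execution count divided by its instruction count; the pair
/// layout here is hypothetical.
namespace sketch {
inline double clusterAverageFrequency(
    const std::vector<std::pair<uint64_t, uint64_t>> &Blocks) {
  double Freq = 0.0;
  for (const auto &B : Blocks)  // B.first = exec count, B.second = # instrs
    if (B.second != 0)
      Freq += double(B.first) / double(B.second);
  return Freq;
}
} // namespace sketch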

/// Base class for a greedy clustering algorithm that selects edges in order
/// based on some heuristic and uses them to join basic blocks into clusters.
class GreedyClusterAlgorithm : public ClusterAlgorithm {
protected:
  // Represents an edge between two basic blocks, with source, destination,
  // and profile count.
  struct EdgeTy {
    const BinaryBasicBlock *Src;
    const BinaryBasicBlock *Dst;
    uint64_t Count;

    EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst,
           uint64_t Count) :
      Src(Src), Dst(Dst), Count(Count) {}

    void print(raw_ostream &OS) const;
  };

  struct EdgeHash {
    size_t operator() (const EdgeTy &E) const;
  };

  struct EdgeEqual {
    bool operator() (const EdgeTy &A, const EdgeTy &B) const;
  };

  // Virtual methods that allow custom specialization of the heuristic used by
  // the algorithm to select edges.
  virtual void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
  virtual void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
  virtual bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0;

  // Map from basic block to owning cluster index.
  using BBToClusterMapTy = std::unordered_map<const BinaryBasicBlock *,
                                              unsigned>;
  BBToClusterMapTy BBToClusterMap;

public:
  void clusterBasicBlocks(const BinaryFunction &BF,
                          bool ComputeEdges = false) override;
  void reset() override;
};


/// This clustering algorithm is based on a greedy heuristic suggested by
/// Pettis and Hansen (PLDI '90).
class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
protected:
  void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
    override;
};


/// This clustering algorithm is based on a greedy heuristic that is a
/// modification of the heuristic suggested by Pettis (PLDI '90). It is
/// geared towards minimizing branches.
class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
private:
  // Map from an edge to its weight, which is used by the algorithm to sort
  // the edges.
  std::unordered_map<EdgeTy, int64_t, EdgeHash, EdgeEqual> Weight;

  // The weight of an edge is calculated as the win in branches if we choose
  // to lay out this edge as a fall-through. For example, consider the edges
  //  A -> B with execution count 500,
  //  A -> C with execution count 100, and
  //  D -> B with execution count 150,
  // where B, C are the only successors of A and A, D are the only
  // predecessors of B. Then if we choose to lay out edge A -> B as a
  // fall-through, the win in branches is 500 - 100 - 150 = 250. That is the
  // weight of edge A -> B.
  int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const;

protected:
  void initQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  void adjustQueue(
      std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
  bool areClustersCompatible(
      const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
    override;

public:
  void reset() override;
};


/// Objects of this class implement various basic block reordering algorithms.
/// Most of these algorithms depend on a clustering algorithm.
/// Here we have three conflicting goals as to how to lay out clusters. If we
/// want to minimize jump offsets, we should put clusters with heavy
/// inter-cluster dependence as close as possible. If we want to maximize the
/// probability that all inter-cluster edges are predicted as not-taken, we
/// should enforce a topological order to make targets appear after sources,
/// creating forward branches. If we want to separate hot from cold blocks to
/// maximize the probability that infrequently executed code doesn't pollute
/// the cache, we should put clusters in descending order of hotness.
class ReorderAlgorithm {
protected:
  std::unique_ptr<ClusterAlgorithm> CAlgo;

public:
  ReorderAlgorithm() { }
  explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
    CAlgo(std::move(CAlgo)) { }

  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  /// Reorder the basic blocks of the given function and store the new order
  /// in the Order vector.
  virtual void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const = 0;

  void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
    this->CAlgo.reset(CAlgo);
  }

  virtual ~ReorderAlgorithm() { }
};


/// Dynamic programming implementation for the TSP, applied to BB layout. Find
/// the optimal way to maximize weight during a path traversing all BBs. In
/// this way, we will convert the hottest branches into fall-throughs.
///
/// Uses an exponential amount of memory in the number of basic blocks and
/// should only be used for small functions.
class OptimalReorderAlgorithm : public ReorderAlgorithm {
public:
  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// Simple algorithm that groups basic blocks into clusters and then
/// lays them out cluster after cluster.
class OptimizeReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// This reorder algorithm tries to ensure that all inter-cluster edges are
/// predicted as not-taken, by enforcing a topological order to make
/// targets appear after sources, creating forward branches.
class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeBranchReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// This reorder algorithm tries to separate hot from cold blocks to maximize
/// the probability that infrequently executed code doesn't pollute the cache,
/// by putting clusters in descending order of hotness.
class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit OptimizeCacheReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};


/// Toy example that simply reverses the original basic block order.
class ReverseReorderAlgorithm : public ReorderAlgorithm {
public:
  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};

/// Create clusters as usual and place them in random order.
class RandomClusterReorderAlgorithm : public ReorderAlgorithm {
public:
  explicit RandomClusterReorderAlgorithm(
      std::unique_ptr<ClusterAlgorithm> CAlgo) :
    ReorderAlgorithm(std::move(CAlgo)) { }

  void reorderBasicBlocks(
      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
};
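
/// Usage sketch (hypothetical helper, not part of this header): how a caller
/// composes a reordering algorithm with a clustering algorithm. Any concrete
/// pair from the classes above can be substituted.
namespace sketch {
inline ReorderAlgorithm::BasicBlockOrder
reorderWithCacheHeuristic(const BinaryFunction &BF) {
  OptimizeCacheReorderAlgorithm RA(
      std::unique_ptr<ClusterAlgorithm>(new PHGreedyClusterAlgorithm()));
  ReorderAlgorithm::BasicBlockOrder Order;
  RA.reorderBasicBlocks(BF, Order); // clusters, sorts by hotness, flattens
  return Order;
}
} // namespace sketch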

} // namespace bolt
} // namespace llvm

#endif