Introduce a pass to reduce jump table footprint

Summary:
Add a pass to identify indirect jumps to jump tables and reduce the
size of their entries from 8 to 4 bytes. For PIC jump tables, the pass
converts the PIC code to non-PIC (since BOLT only processes static code,
it makes no sense to use expensive PIC-style jumps in static code). Add
corresponding improvements to the register scavenging pass and add
MCInst matcher machinery.
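
For reference, the rewrite at each jump site looks roughly like this
(an illustrative x86-64 sketch; the registers and the JT label are
placeholders, not taken from this patch):

  # before: one indirect jump through a table of 8-byte entries
  jmpq   *JT(,%rdx,8)

  # after: 4-byte entries; the 32-bit load zero-extends into the
  # scavenged register, then we jump through it
  movl   JT(,%rdx,4), %ecx
  jmpq   *%rcx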

(cherry picked from FBD6421582)
Rafael Auler, 2017-11-02 00:30:11 -07:00 (committed by Maksim Panchenko)
parent 39a8c36697
commit 21eb2139ee
8 changed files with 453 additions and 10 deletions


@@ -3826,7 +3826,7 @@ uint64_t BinaryFunction::JumpTable::emit(MCStreamer *Streamer,
LastLabel = LI->second;
}
if (Type == JTT_NORMAL) {
Streamer->EmitSymbolValue(Entry, EntrySize);
Streamer->EmitSymbolValue(Entry, OutputEntrySize);
} else { // JTT_PIC
auto JT = MCSymbolRefExpr::create(LastLabel, Streamer->getContext());
auto E = MCSymbolRefExpr::create(Entry, Streamer->getContext());


@@ -538,6 +538,9 @@ public:
/// Size of the entry used for storage.
std::size_t EntrySize;
/// Size of the entry we will write to the output (we may use a more compact layout)
std::size_t OutputEntrySize;
/// The type of this jump table.
JumpTableType Type;
@@ -567,14 +570,11 @@ public:
std::pair<size_t, size_t> getEntriesForAddress(const uint64_t Addr) const;
/// Constructor.
JumpTable(uint64_t Address,
std::size_t EntrySize,
JumpTableType Type,
JumpTable(uint64_t Address, std::size_t EntrySize, JumpTableType Type,
decltype(OffsetEntries) &&OffsetEntries,
decltype(Labels) &&Labels)
: Address(Address), EntrySize(EntrySize), Type(Type),
OffsetEntries(OffsetEntries), Labels(Labels)
{}
: Address(Address), EntrySize(EntrySize), OutputEntrySize(EntrySize),
Type(Type), OffsetEntries(OffsetEntries), Labels(Labels) {}
/// Dynamic number of times each entry in the table was referenced.
/// Identical entries will have a shared count (identical for every
@@ -1275,6 +1275,11 @@ public:
return getJumpTableContainingAddress(Address);
}
JumpTable *getJumpTable(const MCInst &Inst) {
const auto Address = BC.MIA->getJumpTable(Inst);
return getJumpTableContainingAddress(Address);
}
const MCSymbol *getPersonalityFunction() const {
return PersonalityFunction;
}


@@ -16,6 +16,7 @@
#include "Passes/IndirectCallPromotion.h"
#include "Passes/Inliner.h"
#include "Passes/LongJmp.h"
#include "Passes/JTFootprintReduction.h"
#include "Passes/PLTCall.h"
#include "Passes/ReorderFunctions.h"
#include "Passes/StokeInfo.h"
@@ -62,6 +63,19 @@ InlineSmallFunctions("inline-small-functions",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
JTFootprintReductionFlag("jt-footprint-reduction",
cl::desc("make jump tables size smaller at the cost of using more "
"instructions at jump sites"),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
PrintJTFootprintReduction("print-after-jt-footprint-reduction",
cl::desc("print function after jt-footprint-reduction pass"),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
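// Illustrative invocation (hypothetical binary names; the flags are the ones
// defined above):
//   llvm-bolt a.out -o a.bolt -jt-footprint-reduction \
//     -print-after-jt-footprint-reduction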
static cl::opt<bool>
NeverPrint("never-print",
cl::desc("never print"),
@@ -328,6 +342,10 @@ void BinaryFunctionPassManager::runAllPasses(
Manager.registerPass(llvm::make_unique<Peepholes>(PrintPeepholes));
Manager.registerPass(
llvm::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
opts::JTFootprintReductionFlag);
Manager.registerPass(llvm::make_unique<InlineSmallFunctions>(PrintInline),
opts::InlineSmallFunctions);


@@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses
HFSortPlus.cpp
IndirectCallPromotion.cpp
Inliner.cpp
JTFootprintReduction.cpp
LivenessAnalysis.cpp
LongJmp.cpp
MCF.cpp


@@ -37,6 +37,19 @@ void doForAllSuccs(const BinaryBasicBlock &BB,
}
void RegStatePrinter::print(raw_ostream &OS, const BitVector &State) const {
if (State.all()) {
OS << "(all)";
return;
}
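// More than half of the registers are set: print the complement instead.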
if (State.count() > (State.size() >> 1)) {
OS << "all, except: ";
auto BV = State;
BV.flip();
for (auto I = BV.find_first(); I != -1; I = BV.find_next(I)) {
OS << BC.MRI->getName(I) << " ";
}
return;
}
for (auto I = State.find_first(); I != -1; I = State.find_next(I)) {
OS << BC.MRI->getName(I) << " ";
}


@@ -0,0 +1,276 @@
//===--- JTFootprintReduction.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Jump table footprint reduction pass
//
//===----------------------------------------------------------------------===//
#include "JTFootprintReduction.h"
#include "llvm/Support/Options.h"
#define DEBUG_TYPE "JT"
using namespace llvm;
using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> Relocs;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern cl::opt<JumpTableSupportLevel> JumpTables;
} // namespace opts
namespace llvm {
namespace bolt {
void JTFootprintReduction::checkOpportunities(BinaryContext &BC,
BinaryFunction &Function,
DataflowInfoManager &Info) {
std::map<BinaryFunction::JumpTable *, uint64_t> AllJTs;
for (auto &BB : Function) {
for (auto &Inst : BB) {
auto *JumpTable = Function.getJumpTable(Inst);
if (!JumpTable)
continue;
AllJTs[JumpTable] += BB.getKnownExecutionCount();
++IndJmps;
if (BlacklistedJTs.count(JumpTable))
continue;
uint64_t Scale;
// Try a standard indirect jump matcher
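// The match window ends at Inst, which is matched as the root; earlier
// instructions in the block may supply the matched operands. Scale == 8
// means 64-bit entries, which we can shrink only if a scratch register is
// free after the jump (checked below); otherwise blacklist the table.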
auto IndJmpMatcher = BC.MIA->matchIndJmp(
BC.MIA->matchAnyOperand(), BC.MIA->matchImm(Scale),
BC.MIA->matchReg(), BC.MIA->matchAnyOperand());
if (IndJmpMatcher->match(*BC.MRI, *BC.MIA,
MutableArrayRef<MCInst>(&*BB.begin(), &Inst + 1),
-1) &&
Scale == 8) {
if (Info.getLivenessAnalysis().scavengeRegAfter(&Inst))
continue;
BlacklistedJTs.insert(JumpTable);
++IndJmpsDenied;
++NumJTsNoReg;
continue;
}
// Try a PIC matcher. The pattern we are looking for is a PIC JT ind jmp:
// addq %rdx, %rsi
// addq %rdx, %rdi
// leaq DATAat0x402450(%rip), %r11
// movslq (%r11,%rdx,4), %rcx
// addq %r11, %rcx
// jmpq *%rcx # JUMPTABLE @0x402450
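// Beyond the shape above, a safe match requires the same register to feed
// both the add and the load (BaseReg1 == BaseReg2), 4-byte entries
// (Scale == 4), no displacement (Offset == 0), and a base that is the
// address of the jump table itself (PICBaseAddrMatcher).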
MCPhysReg BaseReg1;
MCPhysReg BaseReg2;
uint64_t Offset;
auto PICIndJmpMatcher = BC.MIA->matchIndJmp(BC.MIA->matchAdd(
BC.MIA->matchReg(BaseReg1),
BC.MIA->matchLoad(BC.MIA->matchReg(BaseReg2), BC.MIA->matchImm(Scale),
BC.MIA->matchReg(), BC.MIA->matchImm(Offset))));
auto PICBaseAddrMatcher = BC.MIA->matchIndJmp(
BC.MIA->matchAdd(BC.MIA->matchLoadAddr(BC.MIA->matchSymbol()),
BC.MIA->matchAnyOperand()));
if (!PICIndJmpMatcher->match(
*BC.MRI, *BC.MIA,
MutableArrayRef<MCInst>(&*BB.begin(), &Inst + 1), -1) ||
Scale != 4 || BaseReg1 != BaseReg2 || Offset != 0 ||
!PICBaseAddrMatcher->match(
*BC.MRI, *BC.MIA,
MutableArrayRef<MCInst>(&*BB.begin(), &Inst + 1), -1)) {
BlacklistedJTs.insert(JumpTable);
++IndJmpsDenied;
++NumJTsBadMatch;
continue;
}
}
}
// Statistics only
for (const auto &JTFreq : AllJTs) {
auto *JT = JTFreq.first;
uint64_t CurScore = JTFreq.second;
TotalJTScore += CurScore;
if (!BlacklistedJTs.count(JT)) {
OptimizedScore += CurScore;
if (JT->EntrySize == 8)
BytesSaved += JT->getSize() >> 1;
}
}
TotalJTs += AllJTs.size();
TotalJTsDenied += BlacklistedJTs.size();
}
bool JTFootprintReduction::tryOptimizeNonPIC(
BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr,
BinaryFunction::JumpTable *JumpTable, DataflowInfoManager &Info) {
MCOperand Base;
uint64_t Scale;
MCPhysReg Index;
MCOperand Offset;
auto IndJmpMatcher = BC.MIA->matchIndJmp(
BC.MIA->matchAnyOperand(Base), BC.MIA->matchImm(Scale),
BC.MIA->matchReg(Index), BC.MIA->matchAnyOperand(Offset));
if (!IndJmpMatcher->match(*BC.MRI, *BC.MIA,
MutableArrayRef<MCInst>(&*BB.begin(), &Inst + 1),
-1)) {
return false;
}
assert(Scale == 8 && "Wrong scale");
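// Halve the addressing scale: the output table will store 32-bit entries.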
Scale = 4;
IndJmpMatcher->annotate(*BC.MIA, *BC.Ctx.get(), "DeleteMe");
auto &LA = Info.getLivenessAnalysis();
MCPhysReg Reg = LA.scavengeRegAfter(&Inst);
assert(Reg != 0 && "Register scavenger failed!");
auto RegOp = MCOperand::createReg(Reg);
SmallVector<MCInst, 4> NewFrag;
BC.MIA->createIJmp32Frag(NewFrag, Base, MCOperand::createImm(Scale),
MCOperand::createReg(Index), Offset, RegOp);
BC.MIA->setJumpTable(BC.Ctx.get(), NewFrag.back(), JTAddr, Index);
JumpTable->OutputEntrySize = 4;
BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end());
return true;
}
bool JTFootprintReduction::tryOptimizePIC(
BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr,
BinaryFunction::JumpTable *JumpTable, DataflowInfoManager &Info) {
MCPhysReg BaseReg;
uint64_t Scale;
MCPhysReg Index;
MCOperand Offset;
MCOperand JumpTableRef;
auto PICIndJmpMatcher = BC.MIA->matchIndJmp(BC.MIA->matchAdd(
BC.MIA->matchLoadAddr(BC.MIA->matchAnyOperand(JumpTableRef)),
BC.MIA->matchLoad(BC.MIA->matchReg(BaseReg), BC.MIA->matchImm(Scale),
BC.MIA->matchReg(Index), BC.MIA->matchAnyOperand())));
if (!PICIndJmpMatcher->match(*BC.MRI, *BC.MIA,
MutableArrayRef<MCInst>(&*BB.begin(), &Inst + 1),
-1)) {
return false;
}
assert(Scale == 4 && "Wrong scale");
PICIndJmpMatcher->annotate(*BC.MIA, *BC.Ctx.get(), "DeleteMe");
auto RegOp = MCOperand::createReg(BaseReg);
SmallVector<MCInst, 4> NewFrag;
BC.MIA->createIJmp32Frag(NewFrag, MCOperand::createReg(0),
MCOperand::createImm(Scale),
MCOperand::createReg(Index), JumpTableRef, RegOp);
BC.MIA->setJumpTable(BC.Ctx.get(), NewFrag.back(), JTAddr, Index);
JumpTable->OutputEntrySize = 4;
// DePICify
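// The rewritten sequence loads a 32-bit absolute entry, so the table no
// longer holds PC-relative offsets and can be emitted as a normal table.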
JumpTable->Type = BinaryFunction::JumpTable::JTT_NORMAL;
BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end());
return true;
}
void JTFootprintReduction::optimizeFunction(BinaryContext &BC,
BinaryFunction &Function,
DataflowInfoManager &Info) {
for (auto &BB : Function) {
if (!BB.getNumNonPseudos())
continue;
MCInst &IndJmp = *BB.getLastNonPseudo();
uint64_t JTAddr = BC.MIA->getJumpTable(IndJmp);
if (!JTAddr)
continue;
auto *JumpTable = Function.getJumpTable(IndJmp);
if (BlacklistedJTs.count(JumpTable))
continue;
if (tryOptimizeNonPIC(BC, BB, IndJmp, JTAddr, JumpTable, Info) ||
    tryOptimizePIC(BC, BB, IndJmp, JTAddr, JumpTable, Info)) {
Modified.insert(&Function);
continue;
}
llvm_unreachable("Should either optimize PIC or NonPIC successfully");
}
if (!Modified.count(&Function))
return;
for (auto &BB : Function) {
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
if (BC.MIA->hasAnnotation(*I, "DeleteMe"))
BB.eraseInstruction(&*I);
}
}
}
void JTFootprintReduction::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions
) {
if (opts::JumpTables == JTS_BASIC && opts::Relocs)
return;
BinaryFunctionCallGraph CG(buildCallGraph(BC, BFs));
RegAnalysis RA(BC, BFs, CG);
for (auto &BFIt : BFs) {
auto &Function = BFIt.second;
if (!Function.isSimple() || !opts::shouldProcess(Function))
continue;
if (Function.getKnownExecutionCount() == 0)
continue;
DataflowInfoManager Info(BC, Function, &RA, nullptr);
BlacklistedJTs.clear();
checkOpportunities(BC, Function, Info);
optimizeFunction(BC, Function, Info);
}
if (TotalJTs == TotalJTsDenied) {
outs() << "BOLT-INFO: JT Footprint reduction: no changes were made.\n";
return;
}
outs() << "BOLT-INFO: JT Footprint reduction stats (simple funcs only):\n";
if (OptimizedScore) {
outs() << format("\t %.2lf%%", (OptimizedScore * 100.0 / TotalJTScore))
<< " of dynamic JT entries were reduced.\n";
}
outs() << "\t " << TotalJTs - TotalJTsDenied << " of " << TotalJTs
<< " jump tables affected.\n";
outs() << "\t " << IndJmps - IndJmpsDenied << " of " << IndJmps
<< " indirect jumps to JTs affected.\n";
outs() << "\t " << NumJTsBadMatch
<< " JTs discarded due to unsupported jump pattern.\n";
outs() << "\t " << NumJTsNoReg
<< " JTs discarded due to register unavailability.\n";
outs() << "\t " << BytesSaved
<< " bytes saved.\n";
}
} // namespace bolt
} // namespace llvm


@@ -0,0 +1,85 @@
//===--- JTFootprintReduction.h -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Jump table footprint reduction pass
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_JT_FOOTPRINT_REDUCTION_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_JT_FOOTPRINT_REDUCTION_H
#include "BinaryPasses.h"
#include "DataflowInfoManager.h"
#include "DataReader.h"
namespace llvm {
namespace bolt {
/// This pass identifies indirect jumps to jump tables and reduces the size
/// of their entries from 8 to 4 bytes. For PIC jump tables, it will remove
/// the PIC code (since BOLT only processes static code, it makes no sense to
/// use expensive PIC-style jumps in static code).
class JTFootprintReduction : public BinaryFunctionPass {
uint64_t TotalJTScore{0};
uint64_t TotalJTs{0};
uint64_t TotalJTsDenied{0};
uint64_t OptimizedScore{0};
uint64_t IndJmps{0};
uint64_t IndJmpsDenied{0};
uint64_t NumJTsBadMatch{0};
uint64_t NumJTsNoReg{0};
uint64_t BytesSaved{0};
DenseSet<BinaryFunction::JumpTable *> BlacklistedJTs;
DenseSet<const BinaryFunction *> Modified;
/// Check if \p Function contains jump tables where all jump locations can
/// be safely changed to use a different code sequence. If this is true, we
/// will be able to emit the whole table with a smaller entry size.
void checkOpportunities(BinaryContext &BC, BinaryFunction &Function,
DataflowInfoManager &Info);
/// The Non-PIC jump table optimization consists of reducing the jump table
/// entry size from 8 to 4 bytes. For that, we need to change the jump code
/// sequence from a single jmp * instruction to a pair of load32zext-jmp
/// instructions that depend on the availability of an extra register.
/// This saves dcache/dTLB at the expense of icache.
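///
/// Illustrative sequence (registers are placeholders):
///   Before:  jmpq   *JT(,%rdx,8)
///   After:   movl   JT(,%rdx,4), %ecx   # zero-extends into %rcx
///            jmpq   *%rcx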
bool tryOptimizeNonPIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst,
uint64_t JTAddr, BinaryFunction::JumpTable *JumpTable,
DataflowInfoManager &Info);
/// The PIC jump table optimization consists of "de-pic-ifying" it, since the
/// PIC jump sequence is larger than its non-PIC counterpart, saving icache.
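///
/// Illustrative sequence (registers are placeholders):
///   Before:  leaq   JT(%rip), %r11
///            movslq (%r11,%rdx,4), %rcx
///            addq   %r11, %rcx
///            jmpq   *%rcx
///   After:   movl   JT(,%rdx,4), %ecx
///            jmpq   *%rcx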
bool tryOptimizePIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst,
uint64_t JTAddr, BinaryFunction::JumpTable *JumpTable,
DataflowInfoManager &Info);
/// Run the pass on \p Function.
void optimizeFunction(BinaryContext &BC, BinaryFunction &Function,
DataflowInfoManager &Info);
public:
explicit JTFootprintReduction(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) { }
/// BinaryPass interface functions
const char *getName() const override {
return "jt-footprint-reduction";
}
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm
#endif


@@ -60,8 +60,13 @@ public:
BitVector BV = *this->getStateAt(P);
BV.flip();
BitVector GPRegs(NumRegs, false);
this->BC.MIA->getGPRegs(GPRegs);
this->BC.MIA->getGPRegs(GPRegs, /*IncludeAlias=*/false);
// Ignore the register used as the frame pointer even if it is not live (it
// may be used by CFI, which is not represented in our dataflow).
auto FP = BC.MIA->getAliases(BC.MIA->getFramePointer());
FP.flip();
BV &= GPRegs;
BV &= FP;
int Reg = BV.find_first();
return Reg != -1 ? Reg : 0;
}
@@ -74,6 +79,19 @@ protected:
void preflight() {}
BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) {
// The entry points of this backward analysis are the exit blocks; they
// start with the default live-out set (registers used for return values).
if (BB.succ_size() == 0) {
BitVector State(NumRegs, false);
if (opts::AssumeABI) {
BC.MIA->getDefaultLiveOut(State);
BC.MIA->getCalleeSavedRegs(State);
} else {
State.set();
State.reset(BC.MIA->getFlagsReg());
}
return State;
}
return BitVector(NumRegs, false);
}
@@ -100,7 +118,15 @@ protected:
// because we don't really know what's going on.
if (RA.isConservative(Written)) {
Written.reset();
BC.MIA->getCalleeSavedRegs(Written);
BC.MIA->getDefaultLiveOut(Written);
// If ABI is respected, everything except CSRs should be dead after a
// call
if (opts::AssumeABI) {
auto CSR = BitVector(NumRegs, false);
BC.MIA->getCalleeSavedRegs(CSR);
CSR.flip();
Written |= CSR;
}
}
}
Written.flip();
@@ -108,7 +134,26 @@ protected:
// Gen
if (!this->BC.MIA->isCFI(Point)) {
auto Used = BitVector(NumRegs, false);
RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/false);
if (IsCall) {
RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/true);
if (RA.isConservative(Used)) {
Used = BC.MIA->getRegsUsedAsParams();
BC.MIA->getDefaultLiveOut(Used);
}
}
const auto InstInfo = BC.MII->get(Point.getOpcode());
for (unsigned I = 0, E = Point.getNumOperands(); I != E; ++I) {
if (!Point.getOperand(I).isReg() || I < InstInfo.getNumDefs())
continue;
Used |= BC.MIA->getAliases(Point.getOperand(I).getReg(),
/*OnlySmaller=*/false);
}
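// Registers the instruction uses implicitly (not encoded as explicit
// operands) are live as well; include them and their aliases.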
for (auto
I = InstInfo.getImplicitUses(),
E = InstInfo.getImplicitUses() + InstInfo.getNumImplicitUses();
I != E; ++I) {
Used |= BC.MIA->getAliases(*I, false);
}
if (IsCall &&
(!BC.MIA->isTailCall(Point) || !BC.MIA->isConditionalBranch(Point))) {
// Never gen FLAGS from a non-conditional call... this is overly