[BOLT] Move disassemble optimizations to optimization passes

Summary:
This patch moves instruction shortening (shortenInstructions) and nop removal
into separate binary passes. As a result, when the llvm-bolt optimization stage
begins, the instructions of each binary function are exactly the same as they
were in the input binary. This is needed for Golang support in llvm-bolt.
Some of the tests had to be changed, since basic-block alignment nops might
create unreachable BBs in the original functions.

Vladislav Khmelevsky,
Advanced Software Technology Lab, Huawei

(cherry picked from FBD32896517)
This commit is contained in:
Vladislav Khmelevsky 2021-12-18 17:03:35 -08:00 committed by Maksim Panchenko
parent 46e93fb427
commit 08f56926c2
12 changed files with 124 additions and 64 deletions

View File

@ -691,6 +691,11 @@ public:
return Instructions.erase(II);
}
/// Erase the non-pseudo instruction located at position \p Index by
/// forwarding the corresponding iterator to eraseInstruction().
/// NOTE(review): no bounds check is performed here; callers are presumably
/// expected to pass an index smaller than the number of instructions.
void eraseInstructionAtIndex(unsigned Index) {
  auto Position = Instructions.begin() + Index;
  eraseInstruction(Position);
}
/// Erase instructions in the specified range.
template <typename ItrType>
void eraseInstructions(ItrType Begin, ItrType End) {

View File

@ -1401,6 +1401,8 @@ public:
layout_front()->isCold() != layout_back()->isCold();
}
/// Return the PreserveNops flag. The RemoveNops pass uses this as its skip
/// predicate, so functions for which this is true keep their nops.
bool shouldPreserveNops() const { return PreserveNops; }
/// Return true if the function has exception handling tables.
bool hasEHRanges() const { return HasEHRanges; }

View File

@ -281,17 +281,25 @@ public:
void runOnFunctions(BinaryContext &BC) override;
};
/// Convert instructions to the form with the minimum operand width.
///
/// The pass visits every instruction of every basic block and asks the
/// target's MCPlusBuilder to shorten it. It only does work on x86 targets.
class ShortenInstructions : public BinaryFunctionPass {
/// Try to shorten each instruction in \p Function.
/// \returns the number of instructions that were actually shortened.
uint64_t shortenInstructions(BinaryFunction &Function);
public:
/// \p PrintPass is forwarded unchanged to the BinaryFunctionPass base.
explicit ShortenInstructions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
/// Name under which the pass identifies itself.
const char *getName() const override { return "shorten-instructions"; }
/// Pass entry point: runs shortenInstructions() over the functions of \p BC
/// and prints the total number of shortened instructions.
void runOnFunctions(BinaryContext &BC) override;
};
/// Perform simple peephole optimizations.
class Peepholes : public BinaryFunctionPass {
uint64_t NumShortened{0};
uint64_t NumDoubleJumps{0};
uint64_t TailCallTraps{0};
uint64_t NumUselessCondBranches{0};
/// Attempt to use the minimum operand width for arithmetic, branch and
/// move instructions.
uint64_t shortenInstructions(BinaryContext &BC, BinaryFunction &Function);
/// Add trap instructions immediately after indirect tail calls to prevent
/// the processor from decoding instructions immediate following the
/// tailcall.
@ -432,6 +440,20 @@ public:
void runOnFunctions(BinaryContext &BC) override;
};
/// Pass to remove nops in code
///
/// Only nop instructions that carry the "NOP" annotation are erased; functions
/// that report shouldPreserveNops() are skipped entirely.
class RemoveNops : public BinaryFunctionPass {
/// Erase the annotated nops from every basic block of \p Function.
void runOnFunction(BinaryFunction &Function);
public:
/// \p PrintPass is forwarded unchanged to the BinaryFunctionPass base.
explicit RemoveNops(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
/// Name under which the pass identifies itself.
const char *getName() const override { return "remove-nops"; }
/// Pass entry point
void runOnFunctions(BinaryContext &BC) override;
};
enum FrameOptimizationType : char {
FOP_NONE, /// Don't perform FOP.
FOP_HOT, /// Perform FOP on hot functions.

View File

@ -1316,10 +1316,6 @@ bool BinaryFunction::disassemble() {
}
}
// Convert instruction to a shorter version that could be relaxed if
// needed.
MIB->shortenInstruction(Instruction);
if (MIB->isBranch(Instruction) || MIB->isCall(Instruction)) {
uint64_t TargetAddress = 0;
if (MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
@ -1419,6 +1415,13 @@ add_instruction:
MIB->addAnnotation(Instruction, "Offset", static_cast<uint32_t>(Offset));
}
if (BC.MIB->isNoop(Instruction)) {
// NOTE: disassembly loses the correct size information for noops.
// E.g. nopw 0x0(%rax,%rax,1) is 9 bytes, but re-encoded it's only
// 5 bytes. Preserve the size info using annotations.
MIB->addAnnotation(Instruction, "Size", static_cast<uint32_t>(Size));
}
addInstruction(Offset, std::move(Instruction));
}
@ -2027,13 +2030,6 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
}
}
// Ignore nops except SDT markers. We use nops to derive alignment of the
// next basic block. It will not always work, as some blocks are naturally
// aligned, but it's just part of heuristic for block alignment.
if (MIB->isNoop(Instr) && !PreserveNops && !IsSDTMarker && !IsLKMarker) {
IsLastInstrNop = true;
continue;
}
if (!InsertBB) {
// It must be a fallthrough or unreachable code. Create a new block unless
// we see an unconditional branch following a conditional one. The latter

View File

@ -105,7 +105,6 @@ MinBranchClusters("min-branch-clusters",
enum PeepholeOpts : char {
PEEP_NONE = 0x0,
PEEP_SHORTEN = 0x1,
PEEP_DOUBLE_JUMPS = 0x2,
PEEP_TAILCALL_TRAPS = 0x4,
PEEP_USELESS_BRANCHES = 0x8,
@ -119,7 +118,6 @@ Peepholes("peepholes",
cl::value_desc("opt1,opt2,opt3,..."),
cl::values(
clEnumValN(PEEP_NONE, "none", "disable peepholes"),
clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"),
clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps",
"remove double jumps when able"),
clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"),
@ -1025,30 +1023,45 @@ void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
<< ".\n";
}
uint64_t Peepholes::shortenInstructions(BinaryContext &BC,
BinaryFunction &Function) {
MCInst DebugInst;
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
uint64_t Count = 0;
const BinaryContext &BC = Function.getBinaryContext();
for (BinaryBasicBlock &BB : Function) {
for (MCInst &Inst : BB) {
if (opts::Verbosity > 1) {
DebugInst = Inst;
}
if (BC.MIB->shortenInstruction(Inst)) {
if (opts::Verbosity > 1) {
outs() << "BOLT-INFO: peephole, shortening:\n"
<< "BOLT-INFO: ";
BC.printInstruction(outs(), DebugInst, 0, &Function);
outs() << "BOLT-INFO: to:";
BC.printInstruction(outs(), Inst, 0, &Function);
}
++Count;
MCInst OriginalInst;
if (opts::Verbosity > 2)
OriginalInst = Inst;
if (!BC.MIB->shortenInstruction(Inst))
continue;
if (opts::Verbosity > 2) {
outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
BC.printInstruction(outs(), OriginalInst, 0, &Function);
outs() << "BOLT-INFO: to:";
BC.printInstruction(outs(), Inst, 0, &Function);
}
++Count;
}
}
return Count;
}
/// Pass entry point. Shortens instructions in the functions of \p BC and
/// reports how many instructions were shortened in total.
///
/// Nothing is done (and nothing is printed) for non-x86 targets.
void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  // Functions are handed to the worker through ParallelUtilities, so the
  // running total is kept in an atomic counter.
  std::atomic<uint64_t> NumShortened{0};
  auto ShortenOne = [&](BinaryFunction &Function) {
    NumShortened += shortenInstructions(Function);
  };

  // No skip predicate is supplied (nullptr).
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, ShortenOne,
      nullptr, "ShortenInstructions");

  const uint64_t Total = NumShortened.load();
  outs() << "BOLT-INFO: " << Total << " instructions were shortened\n";
}
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
for (BinaryBasicBlock &BB : Function) {
@ -1099,8 +1112,6 @@ void Peepholes::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
if (shouldOptimize(Function)) {
if (Opts & opts::PEEP_SHORTEN)
NumShortened += shortenInstructions(BC, Function);
if (Opts & opts::PEEP_DOUBLE_JUMPS)
NumDoubleJumps += fixDoubleJumps(Function, false);
if (Opts & opts::PEEP_TAILCALL_TRAPS)
@ -1110,9 +1121,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC) {
assert(Function.validateCFG());
}
}
outs() << "BOLT-INFO: Peephole: " << NumShortened
<< " instructions shortened.\n"
<< "BOLT-INFO: Peephole: " << NumDoubleJumps
outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
<< " double jumps patched.\n"
<< "BOLT-INFO: Peephole: " << TailCallTraps
<< " tail call traps inserted.\n"
@ -1837,5 +1846,30 @@ void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
}
}
/// Erase from every basic block of \p BF each instruction that is a nop and
/// carries the "NOP" annotation.
///
/// NOTE(review): the "NOP" annotation is presumably attached elsewhere (e.g.
/// while building the CFG) to mark ordinary nops that are safe to delete;
/// confirm against the code that sets it.
void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    // Walk the block back to front so that erasing an instruction never
    // shifts the indices that are still to be visited.
    for (int64_t Idx = static_cast<int64_t>(BB.size()); Idx-- > 0;) {
      MCInst &Candidate = BB.getInstructionAtIndex(Idx);
      if (!BC.MIB->isNoop(Candidate))
        continue;
      if (!BC.MIB->hasAnnotation(Candidate, "NOP"))
        continue;
      BB.eraseInstructionAtIndex(Idx);
    }
  }
}
/// Pass entry point. Runs runOnFunction() over the functions of \p BC via
/// ParallelUtilities, skipping every function that asks for its nops to be
/// preserved.
void RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      // Work item: strip the annotated nops from one function.
      [this](BinaryFunction &Function) { runOnFunction(Function); },
      // Skip predicate: leave functions that must keep their nops untouched.
      [](const BinaryFunction &Function) {
        return Function.shouldPreserveNops();
      },
      "RemoveNops");
}
} // namespace bolt
} // namespace llvm

View File

@ -732,7 +732,22 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To,
return true;
}
if (To != ToBB->getOffset()) {
bool OffsetMatches = !!(To == ToBB->getOffset());
if (!OffsetMatches) {
// Skip the nops to support old .fdata
uint64_t Offset = ToBB->getOffset();
for (MCInst &Instr : *ToBB) {
if (!BC.MIB->isNoop(Instr))
break;
Offset += BC.MIB->getAnnotationWithDefault<uint32_t>(Instr, "Size");
}
if (To == Offset)
OffsetMatches = true;
}
if (!OffsetMatches) {
// "To" could be referring to nop instructions in between 2 basic blocks.
// While building the CFG we make sure these nops are attributed to the
// previous basic block, thus we check if the destination belongs to the

View File

@ -408,6 +408,10 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
Manager.registerPass(std::make_unique<ValidateInternalCalls>(NeverPrint));
Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint));
Manager.registerPass(std::make_unique<RemoveNops>(NeverPrint));
Manager.registerPass(std::make_unique<NormalizeCFG>(PrintNormalized));
Manager.registerPass(std::make_unique<StripRepRet>(NeverPrint),

View File

@ -1,6 +0,0 @@
.globl main
main:
subq $0x2000000, (%r13)
.byte 0x49,0x81,0x6d,0x00,0x02,0x00,0x00,0x00
xorq %rax,%rax
ret

View File

@ -13,15 +13,14 @@ RUN: -funcs=main,SolveCubic,usqrt -sequential-disassembly 2>&1 | FileCheck %s
CHECK: Binary Function "main"
CHECK: BB Layout : .LBB00, .Ltmp1, .Ltmp0, .Ltmp3, .Ltmp2, .Ltmp5, .Ltmp4, .Ltmp7, .Ltmp6, .Ltmp9, .Ltmp8, .Ltmp11, .Ltmp10, .Ltmp13, .Ltmp12, .Ltmp15, .Ltmp14, .Ltmp21, .Ltmp20, .Ltmp19, .Ltmp18, .Ltmp17, .Ltmp16, .LFT0, .LFT1, .LFT2, .LFT3, .Ltmp22, .LFT4, .Ltmp23, .LFT5, .Ltmp24, .LFT6, .Ltmp25, .LFT7
CHECK: Binary Function "SolveCubic"
CHECK: BB Layout : .LBB01, .LFT8, .LFT9, .Ltmp33, .LFT10, .Ltmp28, .Ltmp26, .LFT11, .Ltmp37, .Ltmp36, .Ltmp35, .Ltmp34, .Ltmp27, .Ltmp32, .Ltmp31, .Ltmp30, .Ltmp29
CHECK: BB Layout : .LBB01, .LFT8, .LFT9, .Ltmp33, .LFT10, .Ltmp28, .LFT11, .Ltmp26, .LFT12, .Ltmp37, .Ltmp36, .Ltmp35, .Ltmp34, .Ltmp27, .Ltmp32, .Ltmp31, .Ltmp30, .Ltmp29
CHECK: Binary Function "usqrt"
CHECK: BB Layout : .LBB02, .Ltmp39, .LFT12, .Ltmp38, .LFT13
CHECK: BB Layout : .LBB02, .Ltmp39, .LFT13, .Ltmp38, .LFT14
# New order
CHECK: Binary Function "main"
CHECK: BB Layout : .LBB00, .Ltmp1, .Ltmp0, .Ltmp3, .Ltmp2, .Ltmp5, .Ltmp4, .Ltmp7, .Ltmp6, .Ltmp9, .Ltmp8, .Ltmp11, .Ltmp10, .Ltmp13, .Ltmp12, .Ltmp15, .Ltmp14, .Ltmp21, .Ltmp16, .Ltmp18, .Ltmp17, .LFT0, .Ltmp19, .LFT1, .Ltmp20, .LFT2, .LFT3, .Ltmp22, .LFT4, .Ltmp23, .LFT5, .Ltmp24, .LFT6, .Ltmp25, .LFT7
CHECK: Binary Function "SolveCubic"
CHECK: BB Layout : .LBB01, .Ltmp26, .LFT11, .Ltmp37, .Ltmp36, .Ltmp35, .Ltmp34, .LFT8, .LFT9, .Ltmp33, .Ltmp28, .LFT10, .Ltmp27, .Ltmp32, .Ltmp31, .Ltmp30, .Ltmp29
CHECK: BB Layout : .LBB01, .Ltmp26, .LFT12, .Ltmp37, .Ltmp36, .Ltmp35, .Ltmp34, .LFT8, .LFT9, .Ltmp33, .Ltmp28, .LFT10, .Ltmp27, .Ltmp32, .Ltmp31, .Ltmp30, .Ltmp29
CHECK: Binary Function "usqrt"
CHECK: BB Layout : .LBB02, .Ltmp38, .Ltmp39, .LFT12, .LFT13
CHECK: BB Layout : .LBB02, .Ltmp38, .Ltmp39, .LFT13, .LFT14

View File

@ -1,11 +0,0 @@
# Checks that peephole pass works.
RUN: %clang %p/Inputs/peephole.s -o %t.exe
RUN: llvm-bolt %t.exe -o %t -peepholes=shorten
RUN: llvm-objdump -d --disassemble-symbols=main %t | FileCheck %s
CHECK: main
CHECK-NEXT: 49 81 6d 00 00 00 00 02 subq $33554432, (%r13)
CHECK-NEXT: 49 83 6d 00 02 subq $2, (%r13)
CHECK-NEXT: 48 31 c0 xorq %rax, %rax
CHECK-NEXT: c3 retq

View File

@ -23,7 +23,7 @@ PERF2BOLT: 1 usqrt 30 1 usqrt 39 4 33
PERF2BOLT: 1 usqrt 35 1 usqrt 39 0 22
PERF2BOLT: 1 usqrt 3d 1 usqrt 10 0 58
PERF2BOLT: 1 usqrt 3d 1 usqrt 3f 0 22
PERF2BOLT: 1 usqrt 8 1 usqrt 10 0 22
PERF2BOLT: 1 usqrt a 1 usqrt 10 0 22
NEWFORMAT: - name: usqrt
NEWFORMAT: fid: 7
@ -31,7 +31,7 @@ NEWFORMAT: exec: 0
NEWFORMAT: nblocks: 5
NEWFORMAT: blocks:
NEWFORMAT: - bid: 0
NEWFORMAT: insns: 3
NEWFORMAT: insns: 4
NEWFORMAT: succ: [ { bid: 1, cnt: 22 } ]
NEWFORMAT: - bid: 1
NEWFORMAT: insns: 9

View File

@ -2,7 +2,7 @@
# that fit in 32 bits are shortened.
RUN: %clang %p/Inputs/asm_main.c %p/Inputs/shorten_mov.s -o %t.exe
RUN: llvm-bolt %t.exe -peepholes=shorten -o %t
RUN: llvm-bolt %t.exe -o %t
RUN: llvm-objdump -d %t --print-imm-hex | FileCheck %s
CHECK: <foo>: