forked from OSchip/llvm-project
[BOLT] Increase coverage of shrink wrapping [5/5]
Add -experimental-shrink-wrapping flag to control when we want to move callee-saved registers even when addresses of the stack frame are captured and used in pointer arithmetic, making it more challenging to do alias analysis to prove that we do not access optimized stack positions. This alias analysis is not yet implemented, hence, it is experimental. In practice, though, no compiler would emit code to do pointer arithmetic to access a saved callee-saved register unless there is a memory bug or we are failing to identify a callee-saved reg, so I'm not sure how useful it would be to formally prove that. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D126115
This commit is contained in:
parent
3e5f67f356
commit
a3cfdd746e
|
@ -121,6 +121,10 @@ class FrameAnalysis {
|
|||
/// Set of functions that require the stack to be 16B aligned
|
||||
DenseSet<const BinaryFunction *> FunctionsRequireAlignment;
|
||||
|
||||
/// Set of functions that performs computations with stack addresses and
|
||||
/// complicates our understanding of aliasing of stack spaces.
|
||||
DenseSet<const BinaryFunction *> FunctionsWithStackArithmetic;
|
||||
|
||||
/// Owns ArgAccesses for all instructions. References to elements are
|
||||
/// attached to instructions as indexes to this vector, in MCAnnotations.
|
||||
std::vector<ArgAccesses> ArgAccessesVector;
|
||||
|
@ -183,6 +187,12 @@ public:
|
|||
return FunctionsRequireAlignment.count(&Func);
|
||||
}
|
||||
|
||||
/// Return true if \p Func does computation with the address of any stack
|
||||
/// position, meaning we have limited alias analysis on this function.
|
||||
bool hasStackArithmetic(const BinaryFunction &Func) const {
|
||||
return FunctionsWithStackArithmetic.count(&Func);
|
||||
}
|
||||
|
||||
/// Functions for retrieving our specific MCAnnotation data from instructions
|
||||
ErrorOr<ArgAccesses &> getArgAccessesFor(const MCInst &Inst);
|
||||
|
||||
|
|
|
@ -38,6 +38,11 @@ static cl::opt<std::string> FrameOptFunctionNamesFile(
|
|||
static cl::opt<bool> TimeFA("time-fa", cl::desc("time frame analysis steps"),
|
||||
cl::ReallyHidden, cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
ExperimentalSW("experimental-shrink-wrapping",
|
||||
cl::desc("process functions with stack pointer arithmetic"),
|
||||
cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
||||
|
||||
bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
|
||||
if (Function.hasUnknownControlFlow())
|
||||
return false;
|
||||
|
@ -111,6 +116,7 @@ class FrameAccessAnalysis {
|
|||
const MCInst *Prev{nullptr};
|
||||
/// Info about the last frame access
|
||||
bool IsValidAccess{false};
|
||||
bool EscapesStackAddress{false};
|
||||
FrameIndexEntry FIE;
|
||||
|
||||
bool decodeFrameAccess(const MCInst &Inst) {
|
||||
|
@ -124,14 +130,15 @@ class FrameAccessAnalysis {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (IsIndexed || FIE.Size == 0) {
|
||||
if (IsIndexed || (!FIE.Size && (FIE.IsLoad || FIE.IsStore))) {
|
||||
LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
|
||||
LLVM_DEBUG(dbgs() << "Blame insn: ");
|
||||
LLVM_DEBUG(BC.printInstruction(outs(), Inst, 0, &BF, true, false, false));
|
||||
LLVM_DEBUG(Inst.dump());
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(FIE.Size != 0);
|
||||
assert(FIE.Size != 0 || (!FIE.IsLoad && !FIE.IsStore));
|
||||
|
||||
FIE.RegOrImm = SrcImm;
|
||||
if (FIE.IsLoad || FIE.IsStoreFromReg)
|
||||
|
@ -171,9 +178,11 @@ public:
|
|||
const FrameIndexEntry &getFIE() const { return FIE; }
|
||||
int getSPOffset() const { return SPOffset; }
|
||||
bool isValidAccess() const { return IsValidAccess; }
|
||||
bool doesEscapeStackAddress() const { return EscapesStackAddress; }
|
||||
|
||||
bool doNext(const BinaryBasicBlock &BB, const MCInst &Inst) {
|
||||
IsValidAccess = false;
|
||||
EscapesStackAddress = false;
|
||||
std::tie(SPOffset, FPOffset) =
|
||||
Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB);
|
||||
Prev = &Inst;
|
||||
|
@ -214,11 +223,14 @@ public:
|
|||
}
|
||||
|
||||
if (BC.MIB->escapesVariable(Inst, SPT.HasFramePointer)) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "Leaked stack address, giving up on this function.\n");
|
||||
LLVM_DEBUG(dbgs() << "Blame insn: ");
|
||||
LLVM_DEBUG(Inst.dump());
|
||||
return false;
|
||||
EscapesStackAddress = true;
|
||||
if (!opts::ExperimentalSW) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "Leaked stack address, giving up on this function.\n");
|
||||
LLVM_DEBUG(dbgs() << "Blame insn: ");
|
||||
LLVM_DEBUG(Inst.dump());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return decodeFrameAccess(Inst);
|
||||
|
@ -405,7 +417,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
|
|||
FAA.enterNewBB();
|
||||
|
||||
for (MCInst &Inst : *BB) {
|
||||
if (!FAA.doNext(*BB, Inst)) {
|
||||
if (!FAA.doNext(*BB, Inst) || FAA.doesEscapeStackAddress()) {
|
||||
ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0));
|
||||
NoInfo = true;
|
||||
break;
|
||||
|
@ -475,6 +487,12 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
|
|||
dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n";
|
||||
});
|
||||
|
||||
if (FAA.doesEscapeStackAddress()) {
|
||||
if (!FunctionsWithStackArithmetic.count(&BF))
|
||||
FunctionsWithStackArithmetic.insert(&BF);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!FAA.isValidAccess())
|
||||
continue;
|
||||
|
||||
|
|
|
@ -268,13 +268,15 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
|
|||
{
|
||||
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
|
||||
opts::TimeOpts);
|
||||
removeUnnecessaryLoads(*RA, *FA, I.second);
|
||||
if (!FA->hasStackArithmetic(I.second))
|
||||
removeUnnecessaryLoads(*RA, *FA, I.second);
|
||||
}
|
||||
|
||||
if (opts::RemoveStores) {
|
||||
NamedRegionTimer T1("removestores", "remove stores", "FOP",
|
||||
"FOP breakdown", opts::TimeOpts);
|
||||
removeUnusedStores(*FA, I.second);
|
||||
if (!FA->hasStackArithmetic(I.second))
|
||||
removeUnusedStores(*FA, I.second);
|
||||
}
|
||||
// Don't even start shrink wrapping if no profiling info is available
|
||||
if (I.second.getKnownExecutionCount() == 0)
|
||||
|
|
|
@ -1066,6 +1066,14 @@ bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
|
|||
});
|
||||
return false;
|
||||
}
|
||||
if (FA.hasStackArithmetic(BF)) {
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "Reg " << CSR
|
||||
<< " is not using push/pops due to function "
|
||||
"taking the address of a stack position.\n";
|
||||
});
|
||||
return false;
|
||||
}
|
||||
for (MCInst *Save : CSA.getSavesByReg(CSR)) {
|
||||
if (!SLM.canCollapseRegion(Save)) {
|
||||
LLVM_DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");
|
||||
|
|
|
@ -1091,8 +1091,10 @@ public:
|
|||
bool IsLoad = MCII.mayLoad();
|
||||
bool IsStore = MCII.mayStore();
|
||||
// Is it LEA? (deals with memory but is not loading nor storing)
|
||||
if (!IsLoad && !IsStore)
|
||||
return false;
|
||||
if (!IsLoad && !IsStore) {
|
||||
I = {0, IsLoad, IsStore, false, false};
|
||||
break;
|
||||
}
|
||||
uint8_t Sz = getMemDataSize(Inst, MemOpNo);
|
||||
I = {Sz, IsLoad, IsStore, false, false};
|
||||
break;
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
# This checks that frame optimizer does not try to optimize away caller-saved
|
||||
# regs when we do not have complete aliasing info (when there is an LEA
|
||||
# instruction and the function does arithmetic with stack addresses).
|
||||
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: llvm-strip --strip-unneeded %t.o
|
||||
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
|
||||
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
|
||||
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
|
||||
# RUN: -lite=0 -eliminate-unreachable=false | FileCheck %s
|
||||
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
|
||||
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
|
||||
|
||||
.globl foo
|
||||
.type foo, %function
|
||||
foo:
|
||||
.cfi_startproc
|
||||
movq $0, (%rsi)
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size foo, .-foo
|
||||
|
||||
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# FDATA: 0 [unknown] 0 1 _start 0 0 1
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
subq $0x20, %rsp
|
||||
je b
|
||||
c:
|
||||
addq $0x20, %rsp
|
||||
pop %rbp
|
||||
ret
|
||||
b:
|
||||
movq %rdi, %r13
|
||||
movq %r13, -0x08(%rbp)
|
||||
leaq -0x08(%rbp), %rsi
|
||||
callq foo
|
||||
movq -0x08(%rbp), %r13
|
||||
jmp c
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
|
||||
|
||||
# CHECK: BOLT-INFO: FOP deleted 0 load(s) (dyn count: 0) and 0 store(s)
|
||||
|
||||
# CHECK-OBJDUMP: <_start>:
|
||||
# CHECK-OBJDUMP: movq %rdi, %r13
|
||||
# CHECK-OBJDUMP-NEXT: movq %r13, -0x8(%rbp)
|
||||
# CHECK-OBJDUMP-NEXT: leaq
|
||||
# CHECK-OBJDUMP-NEXT: callq
|
||||
# CHECK-OBJDUMP-NEXT: movq -0x8(%rbp), %r13
|
|
@ -0,0 +1,65 @@
|
|||
# This checks that shrink wrapping correctly drops moving push/pops when
|
||||
# there is an LEA instruction.
|
||||
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: llvm-strip --strip-unneeded %t.o
|
||||
# RUN: ld.lld %t.o -o %t.exe -q
|
||||
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
|
||||
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
|
||||
# RUN: -experimental-shrink-wrapping \
|
||||
# RUN: -eliminate-unreachable=false | FileCheck %s
|
||||
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
|
||||
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
|
||||
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# FDATA: 0 [unknown] 0 1 _start 0 0 1
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r14
|
||||
subq $0x20, %rsp
|
||||
je b
|
||||
c:
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
b:
|
||||
je f
|
||||
jmp *JT(,%rdi,8)
|
||||
d:
|
||||
mov %r14, %rdi
|
||||
mov %rbx, %rdi
|
||||
leaq -0x20(%rbp), %r14
|
||||
movq -0x20(%rbp), %rdi
|
||||
f:
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
.data
|
||||
JT:
|
||||
.quad c
|
||||
.quad d
|
||||
.quad f
|
||||
|
||||
|
||||
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
|
||||
|
||||
# Checks that offsets of instructions accessing the stack were not changed
|
||||
# CHECK-OBJDUMP: <_start>:
|
||||
# CHECK-OBJDUMP: movq %rbx, %rdi
|
||||
# CHECK-OBJDUMP-NEXT: leaq -0x20(%rbp), %r14
|
||||
# CHECK-OBJDUMP: movq -0x20(%rbp), %rdi
|
|
@ -0,0 +1,58 @@
|
|||
# This checks that shrink wrapping correctly drops moving push/pops when
|
||||
# there is a MOV instruction loading the value of the stack pointer in
|
||||
# order to do pointer arithmetic with a stack address.
|
||||
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: llvm-strip --strip-unneeded %t.o
|
||||
# RUN: ld.lld %t.o -o %t.exe -q
|
||||
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
|
||||
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
|
||||
# RUN: -experimental-shrink-wrapping \
|
||||
# RUN: -eliminate-unreachable=false | FileCheck %s
|
||||
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# FDATA: 0 [unknown] 0 1 _start 0 0 1
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r14
|
||||
subq $0x20, %rsp
|
||||
je b
|
||||
c:
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
b:
|
||||
je f
|
||||
jmp *JT(,%rdi,8)
|
||||
d:
|
||||
mov %r14, %rdi
|
||||
mov %rbx, %rdi
|
||||
mov %rbp, %rdi
|
||||
sub 0x20, %rdi
|
||||
f:
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
.data
|
||||
JT:
|
||||
.quad c
|
||||
.quad d
|
||||
.quad f
|
||||
|
||||
|
||||
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
|
|
@ -21,6 +21,7 @@ c:
|
|||
pushf
|
||||
push %rbx
|
||||
push %rbp
|
||||
addq $0x50, -0x30(%rbp)
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
popf
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
# This checks that shrink wrapping uses the red zone defined in the X86 ABI by
|
||||
# placing restores that access elements already deallocated by the stack.
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: llvm-strip --strip-unneeded %t.o
|
||||
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
|
||||
# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
|
||||
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
|
||||
# RUN: -experimental-shrink-wrapping \
|
||||
# RUN: -eliminate-unreachable=false | FileCheck %s
|
||||
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
|
||||
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
|
||||
|
||||
|
||||
# Here we create a CFG where the restore position matches the previous (deleted)
|
||||
# restore position. Shrink wrapping then will put a stack access to an element
|
||||
# that was deallocated at the previously deleted POP, which falls in the red
|
||||
# zone and should be safe for X86 Linux ABI.
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# FDATA: 0 [unknown] 0 1 _start 0 0 1
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r14
|
||||
subq $0x20, %rsp
|
||||
b: je hot_path
|
||||
# FDATA: 1 _start #b# 1 _start #hot_path# 0 1
|
||||
cold_path:
|
||||
mov %r14, %rdi
|
||||
mov %rbx, %rdi
|
||||
movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
|
||||
leaq -0x20(%rbp), %r14
|
||||
movq -0x20(%rbp), %rdi
|
||||
leaq -0x10(%rbp), %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
hot_path:
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
|
||||
.data
|
||||
rel: .quad cold_path
|
||||
|
||||
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
|
||||
|
||||
# CHECK-OBJDUMP: <_start>:
|
||||
# CHECK-OBJDUMP: leaq (%rbp), %rsp
|
||||
# CHECK-OBJDUMP-NEXT: popq %rbp
|
||||
# CHECK-OBJDUMP-NEXT: movq -0x10(%rsp), %rbx
|
||||
# CHECK-OBJDUMP-NEXT: movq -0x18(%rsp), %r14
|
||||
# CHECK-OBJDUMP-NEXT: retq
|
Loading…
Reference in New Issue