[X86] Add Indirect Thunk Support to X86 to mitigate Load Value Injection (LVI)

This pass replaces each indirect call/jump with a direct call/jump to a thunk that looks like:

lfence
jmpq *%r11

This ensures that if the value in register %r11 was loaded from memory, then
the value in %r11 is (architecturally) correct prior to the jump.
This patch also adds a new target feature to X86: +lvi-cfi
("cfi" meaning control-flow integrity).
The feature can be enabled from the clang command line with -mlvi-cfi.
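
To make the rewrite concrete, here is a hedged sketch of what happens at a call site (grounded in the CHECK lines of the new test below; the pass pins the target in %r11 because that is the register the thunk dispatches through):

```
callq *%rax                  # before: target may be a transiently-faulting load value

movq %rax, %r11              # after (sketch): pin the target in %r11
callq __llvm_lvi_thunk_r11   # call the thunk, which fences before jumping
```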

This is an alternative implementation to https://reviews.llvm.org/D75934 that merges the thunk insertion functionality with the existing X86 retpoline code.

Differential Revision: https://reviews.llvm.org/D76812
Scott Constable 2020-04-02 21:59:47 -07:00 committed by Craig Topper
parent 0a3845b70f
commit 5b519cf1fc
9 changed files with 379 additions and 8 deletions


@@ -2625,6 +2625,10 @@ Use Intel MCU ABI
Generate branches with extended addressability, usually via indirect jumps.
.. option:: -mlvi-cfi, -mno-lvi-cfi
Enable only control-flow mitigations for Load Value Injection (LVI)
.. option:: -mmacosx-version-min=<arg>, -mmacos-version-min=<arg>
Set Mac OS X deployment target


@@ -2309,6 +2309,10 @@ def mspeculative_load_hardening : Flag<["-"], "mspeculative-load-hardening">,
Group<m_Group>, Flags<[CoreOption,CC1Option]>;
def mno_speculative_load_hardening : Flag<["-"], "mno-speculative-load-hardening">,
Group<m_Group>, Flags<[CoreOption]>;
def mlvi_cfi : Flag<["-"], "mlvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
HelpText<"Enable only control-flow mitigations for Load Value Injection (LVI)">;
def mno_lvi_cfi : Flag<["-"], "mno-lvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
HelpText<"Disable control-flow mitigations for Load Value Injection (LVI)">;
def mrelax : Flag<["-"], "mrelax">, Group<m_riscv_Features_Group>,
HelpText<"Enable linker relaxation">;


@@ -146,6 +146,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
// flags). This is a bit hacky but keeps existing usages working. We should
// consider deprecating this and instead warn if the user requests external
// retpoline thunks and *doesn't* request some form of retpolines.
auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
options::OPT_mspeculative_load_hardening,
options::OPT_mno_speculative_load_hardening)) {
@@ -153,12 +154,14 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
false)) {
Features.push_back("+retpoline-indirect-calls");
Features.push_back("+retpoline-indirect-branches");
SpectreOpt = options::OPT_mretpoline;
} else if (Args.hasFlag(options::OPT_mspeculative_load_hardening,
options::OPT_mno_speculative_load_hardening,
false)) {
// On x86, speculative load hardening relies on at least using retpolines
// for indirect calls.
Features.push_back("+retpoline-indirect-calls");
SpectreOpt = options::OPT_mspeculative_load_hardening;
}
} else if (Args.hasFlag(options::OPT_mretpoline_external_thunk,
options::OPT_mno_retpoline_external_thunk, false)) {
@@ -166,6 +169,20 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
// eventually switch to an error here.
Features.push_back("+retpoline-indirect-calls");
Features.push_back("+retpoline-indirect-branches");
SpectreOpt = options::OPT_mretpoline_external_thunk;
}
auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
if (Args.hasFlag(options::OPT_mlvi_cfi, options::OPT_mno_lvi_cfi, false)) {
Features.push_back("+lvi-cfi");
LVIOpt = options::OPT_mlvi_cfi;
}
if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
LVIOpt != clang::driver::options::ID::OPT_INVALID) {
D.Diag(diag::err_drv_argument_not_allowed_with)
<< D.getOpts().getOptionName(SpectreOpt)
<< D.getOpts().getOptionName(LVIOpt);
}
// Now add any that the user explicitly requested on the command line,
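
Because the LVI and Spectre v2 thunks would conflict, the driver rejects combining -mlvi-cfi with any retpoline/SLH option. A hedged sketch of how this could be exercised, in the RUN-line style of the driver test below (assumed lines, not part of the actual test):

```
// RUN: not %clang -target x86_64-linux-gnu -mretpoline -mlvi-cfi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CONFLICT %s
// CONFLICT: error: invalid argument '-mretpoline' not allowed with '-mlvi-cfi'
```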


@@ -154,6 +154,11 @@
// SLH: "-mspeculative-load-hardening"
// NO-SLH-NOT: retpoline
// RUN: %clang -target i386-linux-gnu -mlvi-cfi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVICFI %s
// RUN: %clang -target i386-linux-gnu -mno-lvi-cfi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-LVICFI %s
// LVICFI: "-target-feature" "+lvi-cfi"
// NO-LVICFI-NOT: lvi-cfi
// RUN: %clang -target i386-linux-gnu -mwaitpkg %s -### -o %t.o 2>&1 | FileCheck -check-prefix=WAITPKG %s
// RUN: %clang -target i386-linux-gnu -mno-waitpkg %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-WAITPKG %s
// WAITPKG: "-target-feature" "+waitpkg"


@@ -433,6 +433,15 @@ def FeatureRetpolineExternalThunk
"ourselves. Only has effect when combined with some other retpoline "
"feature", [FeatureRetpolineIndirectCalls]>;
// Mitigate LVI attacks against indirect calls/branches and call returns
def FeatureLVIControlFlowIntegrity
: SubtargetFeature<
"lvi-cfi", "UseLVIControlFlowIntegrity", "true",
"Prevent indirect calls/branches from using a memory operand, and "
"precede all indirect calls/branches from a register with an "
"LFENCE instruction to serialize control flow. Also decompose RET "
"instructions into a POP+LFENCE+JMP sequence.">;
// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction">;


@@ -31972,6 +31972,11 @@ static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
}
llvm_unreachable("unexpected reg for retpoline");
}
if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "LVI thunks are only supported on 64-bit targets!");
return "__llvm_lvi_thunk_r11";
}
llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}


@@ -14,6 +14,8 @@
///
/// Currently supported thunks include:
/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls
/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization
/// before making an indirect call/jump
///
/// Note that the reason that this is implemented as a MachineFunctionPass and
/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline
@@ -44,11 +46,14 @@ using namespace llvm;
#define DEBUG_TYPE "x86-retpoline-thunks"
static const char RetpolineNamePrefix[] = "__llvm_retpoline_";
static const char R11RetpolineName[] = "__llvm_retpoline_r11";
static const char EAXRetpolineName[] = "__llvm_retpoline_eax";
static const char ECXRetpolineName[] = "__llvm_retpoline_ecx";
static const char EDXRetpolineName[] = "__llvm_retpoline_edx";
static const char EDIRetpolineName[] = "__llvm_retpoline_edi";
static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
namespace {
template <typename Derived> class ThunkInserter {
@@ -80,6 +85,38 @@ struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
void populateThunk(MachineFunction &MF);
};
struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
const char *getThunkPrefix() { return LVIThunkNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
}
void insertThunks(MachineModuleInfo &MMI) {
createThunkFunction(MMI, R11LVIThunkName);
}
void populateThunk(MachineFunction &MF) {
// Grab the entry MBB and erase any other blocks. O0 codegen appears to
// generate two bbs for the entry block.
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
while (MF.size() > 1)
MF.erase(std::next(MF.begin()));
// This code mitigates LVI by replacing each indirect call/jump with a
// direct call/jump to a thunk that looks like:
// ```
// lfence
// jmpq *%r11
// ```
// This ensures that if the value in register %r11 was loaded from memory,
// then the value in %r11 is (architecturally) correct prior to the jump.
const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
MF.front().addLiveIn(X86::R11);
return;
}
};
class X86IndirectThunks : public MachineFunctionPass {
public:
static char ID;
@@ -98,7 +135,7 @@ public:
}
private:
std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
// FIXME: When LLVM moves to C++17, these can become folds
template <typename... ThunkInserterT>
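
As an aside on the FIXME above: in C++17 the recursive tuple helpers can collapse into std::apply plus a fold expression. A minimal sketch with illustrative names, not the pass's actual interface:

```
#include "llvm/IR/Module.h"
#include <tuple>

// Illustrative only: invoke init() on every thunk inserter in the tuple.
template <typename... InserterTs>
static void initAllInserters(llvm::Module &M, std::tuple<InserterTs...> &TIs) {
  std::apply([&](auto &...TI) { (TI.init(M), ...); }, TIs);
}
```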


@@ -428,6 +428,12 @@ protected:
/// than emitting one inside the compiler.
bool UseRetpolineExternalThunk = false;
/// Prevent generation of indirect call/branch instructions from memory,
/// and force all indirect call/branch instructions from a register to be
/// preceded by an LFENCE. Also decompose RET instructions into a
/// POP+LFENCE+JMP sequence.
bool UseLVIControlFlowIntegrity = false;
/// Use software floating point for code generation.
bool UseSoftFloat = false;
@@ -719,13 +725,16 @@ public:
// These are generic getters that OR together all of the thunk types
// supported by the subtarget. Therefore useIndirectThunk*() will return true
// if any respective thunk feature is enabled.
bool useIndirectThunkCalls() const {
return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
}
bool useIndirectThunkBranches() const {
return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
}
bool preferMaskRegisters() const { return PreferMaskRegisters; }
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
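
These aggregate getters let lowering code test one predicate rather than enumerating every mitigation. A hedged sketch of the caller-side pattern (simplified; the second argument to getIndirectThunkSymbol is assumed):

```
// Sketch: any enabled thunk feature routes indirect calls through a thunk.
if (Subtarget.useIndirectThunkCalls()) {
  const char *Sym = getIndirectThunkSymbol(Subtarget, X86::R11);
  // ... emit a direct call to Sym, with the real target pinned in %r11 ...
}
```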


@@ -0,0 +1,281 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi < %s | FileCheck %s --check-prefix=X64
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi -O0 < %s | FileCheck %s --check-prefix=X64FAST
;
; Note that a lot of this code was lifted from retpoline.ll.
declare void @bar(i32)
; Test a simple indirect call and tail call.
define void @icall_reg(void (i32)* %fp, i32 %x) {
entry:
tail call void @bar(i32 %x)
tail call void %fp(i32 %x)
tail call void @bar(i32 %x)
tail call void %fp(i32 %x)
ret void
}
; X64-LABEL: icall_reg:
; X64-DAG: movq %rdi, %[[fp:[^ ]*]]
; X64-DAG: movl %esi, %[[x:[^ ]*]]
; X64: movl %esi, %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64: movl %[[x]], %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_reg:
; X64FAST: callq bar
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: callq bar
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL
@global_fp = external global void (i32)*
; Test an indirect call through a global variable.
define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
%fp1 = load void (i32)*, void (i32)** @global_fp
call void %fp1(i32 %x)
%fp2 = load void (i32)*, void (i32)** @global_fp
tail call void %fp2(i32 %x)
ret void
}
; X64-LABEL: icall_global_fp:
; X64-DAG: movl %edi, %[[x:[^ ]*]]
; X64-DAG: movq global_fp(%rip), %r11
; X64: callq __llvm_lvi_thunk_r11
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq global_fp(%rip), %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_global_fp:
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL
%struct.Foo = type { void (%struct.Foo*)** }
; Test an indirect call through a vtable.
define void @vcall(%struct.Foo* %obj) #0 {
%vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0
%vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field
%vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1
%fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot
tail call void %fp(%struct.Foo* %obj)
tail call void %fp(%struct.Foo* %obj)
ret void
}
; X64-LABEL: vcall:
; X64: movq %rdi, %[[obj:[^ ]*]]
; X64: movq (%rdi), %[[vptr:[^ ]*]]
; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]]
; X64: movq %[[fp]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64-DAG: movq %[[obj]], %rdi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL
; X64FAST-LABEL: vcall:
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL
declare void @direct_callee()
define void @direct_tail() #0 {
tail call void @direct_callee()
ret void
}
; X64-LABEL: direct_tail:
; X64: jmp direct_callee # TAILCALL
; X64FAST-LABEL: direct_tail:
; X64FAST: jmp direct_callee # TAILCALL
declare void @nonlazybind_callee() #1
define void @nonlazybind_caller() #0 {
call void @nonlazybind_callee()
tail call void @nonlazybind_callee()
ret void
}
; X64-LABEL: nonlazybind_caller:
; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]]
; X64: movq %[[REG]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64: movq %[[REG]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL
; X64FAST-LABEL: nonlazybind_caller:
; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL
; Check that a switch gets lowered using a jump table
define void @switch_jumptable(i32* %ptr, i64* %sink) #0 {
; X64-LABEL: switch_jumptable:
; X64-NOT: jmpq *
entry:
br label %header
header:
%i = load volatile i32, i32* %ptr
switch i32 %i, label %bb0 [
i32 1, label %bb1
i32 2, label %bb2
i32 3, label %bb3
i32 4, label %bb4
i32 5, label %bb5
i32 6, label %bb6
i32 7, label %bb7
i32 8, label %bb8
i32 9, label %bb9
]
bb0:
store volatile i64 0, i64* %sink
br label %header
bb1:
store volatile i64 1, i64* %sink
br label %header
bb2:
store volatile i64 2, i64* %sink
br label %header
bb3:
store volatile i64 3, i64* %sink
br label %header
bb4:
store volatile i64 4, i64* %sink
br label %header
bb5:
store volatile i64 5, i64* %sink
br label %header
bb6:
store volatile i64 6, i64* %sink
br label %header
bb7:
store volatile i64 7, i64* %sink
br label %header
bb8:
store volatile i64 8, i64* %sink
br label %header
bb9:
store volatile i64 9, i64* %sink
br label %header
}
@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0),
i8* blockaddress(@indirectbr_rewrite, %bb1),
i8* blockaddress(@indirectbr_rewrite, %bb2),
i8* blockaddress(@indirectbr_rewrite, %bb3),
i8* blockaddress(@indirectbr_rewrite, %bb4),
i8* blockaddress(@indirectbr_rewrite, %bb5),
i8* blockaddress(@indirectbr_rewrite, %bb6),
i8* blockaddress(@indirectbr_rewrite, %bb7),
i8* blockaddress(@indirectbr_rewrite, %bb8),
i8* blockaddress(@indirectbr_rewrite, %bb9)]
; Check that when thunks are enabled the indirectbr instruction gets
; rewritten to use switch, and that in turn doesn't get lowered as a jump
; table.
define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 {
; X64-LABEL: indirectbr_rewrite:
; X64-NOT: jmpq *
entry:
%i0 = load i64, i64* %p
%target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0
%target0 = load i8*, i8** %target.i0
indirectbr i8* %target0, [label %bb1, label %bb3]
bb0:
store volatile i64 0, i64* %sink
br label %latch
bb1:
store volatile i64 1, i64* %sink
br label %latch
bb2:
store volatile i64 2, i64* %sink
br label %latch
bb3:
store volatile i64 3, i64* %sink
br label %latch
bb4:
store volatile i64 4, i64* %sink
br label %latch
bb5:
store volatile i64 5, i64* %sink
br label %latch
bb6:
store volatile i64 6, i64* %sink
br label %latch
bb7:
store volatile i64 7, i64* %sink
br label %latch
bb8:
store volatile i64 8, i64* %sink
br label %latch
bb9:
store volatile i64 9, i64* %sink
br label %latch
latch:
%i.next = load i64, i64* %p
%target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next
%target.next = load i8*, i8** %target.i.next
; Potentially hit a full 10 successors here so that even if we rewrite as
; a switch it will try to be lowered with a jump table.
indirectbr i8* %target.next, [label %bb0,
label %bb1,
label %bb2,
label %bb3,
label %bb4,
label %bb5,
label %bb6,
label %bb7,
label %bb8,
label %bb9]
}
; Lastly check that the necessary thunks were emitted.
;
; X64-LABEL: .section .text.__llvm_lvi_thunk_r11,{{.*}},__llvm_lvi_thunk_r11,comdat
; X64-NEXT: .hidden __llvm_lvi_thunk_r11
; X64-NEXT: .weak __llvm_lvi_thunk_r11
; X64: __llvm_lvi_thunk_r11:
; X64-NEXT: # {{.*}} # %entry
; X64-NEXT: lfence
; X64-NEXT: jmpq *%r11
attributes #1 = { nonlazybind }