From 7fdf270965584a3b63ffed85d3c1ef20b3510668 Mon Sep 17 00:00:00 2001 From: Jianzhou Zhao Date: Wed, 21 Apr 2021 04:54:29 +0000 Subject: [PATCH] [dfsan] Track origin at loads The first version of origin tracking tracks only memory stores. Although this is sufficient for understanding correct flows, it is hard to figure out where an undefined value is read from. To find where undefined values are read, we still have to do a reverse binary search from the last store in the chain, with printing and logging at possible code paths. This is quite inefficient. Tracking memory load instructions can help in this case. The main issues of tracking loads are performance and code size overheads. When tracking only stores, the code size overhead is 38%, memory overhead is 1x, and CPU overhead is 3x. In practice the number of loads is much larger than the number of stores, so both code size and CPU overhead increase. The first blocker is code size overhead: linking fails if we inline the load-tracking code. The workaround is using external function calls to propagate metadata. This is also the workaround ASan uses. The CPU overhead is ~10x. This is a trade-off between debuggability and performance, and will be used only when debugging cases where tracking only stores is not enough. 
Reviewed By: gbalats Differential Revision: https://reviews.llvm.org/D100967 --- compiler-rt/lib/dfsan/dfsan.cpp | 20 ++++- compiler-rt/test/dfsan/origin_track_ld.c | 31 +++++++ .../Instrumentation/DataFlowSanitizer.cpp | 82 +++++++++++++++++-- .../DataFlowSanitizer/basic.ll | 1 + .../DataFlowSanitizer/origin_track_load.ll | 32 ++++++++ 5 files changed, 153 insertions(+), 13 deletions(-) create mode 100644 compiler-rt/test/dfsan/origin_track_ld.c create mode 100644 llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 2aff8869d2cf..e60703cc4067 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -559,14 +559,26 @@ static void WriteShadowIfDifferent(dfsan_label label, uptr shadow_addr, } } +#define RET_CHAIN_ORIGIN(id) \ + GET_CALLER_PC_BP_SP; \ + (void)sp; \ + GET_STORE_STACK_TRACE_PC_BP(pc, bp); \ + return ChainOrigin(id, &stack); + // Return a new origin chain with the previous ID id and the current stack // trace. extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_origin __dfsan_chain_origin(dfsan_origin id) { - GET_CALLER_PC_BP_SP; - (void)sp; - GET_STORE_STACK_TRACE_PC_BP(pc, bp); - return ChainOrigin(id, &stack); + RET_CHAIN_ORIGIN(id) +} + +// Return a new origin chain with the previous ID id and the current stack +// trace if the label is tainted. +extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_origin +__dfsan_chain_origin_if_tainted(dfsan_label label, dfsan_origin id) { + if (!label) + return id; + RET_CHAIN_ORIGIN(id) } // Copy or move the origins of the len bytes from src to dst. 
diff --git a/compiler-rt/test/dfsan/origin_track_ld.c b/compiler-rt/test/dfsan/origin_track_ld.c new file mode 100644 index 000000000000..96edbea5381e --- /dev/null +++ b/compiler-rt/test/dfsan/origin_track_ld.c @@ -0,0 +1,31 @@ +// RUN: %clang_dfsan -gmlt -mllvm -dfsan-track-origins=2 -mllvm -dfsan-fast-16-labels=true %s -o %t && \ +// RUN: %run %t > %t.out 2>&1 +// RUN: FileCheck %s < %t.out +// +// REQUIRES: x86_64-target-arch + +#include + +__attribute__((noinline)) uint64_t foo(uint64_t a, uint64_t b) { return a + b; } + +int main(int argc, char *argv[]) { + uint64_t a = 10; + uint64_t b = 20; + dfsan_set_label(8, &a, sizeof(a)); + uint64_t c = foo(a, b); + dfsan_print_origin_trace(&c, NULL); +} + +// CHECK: Taint value 0x8 {{.*}} origin tracking () +// CHECK: Origin value: {{.*}}, Taint value was stored to memory at +// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-6]] + +// CHECK: Origin value: {{.*}}, Taint value was stored to memory at +// CHECK: #0 {{.*}} in dfs$foo {{.*}}origin_track_ld.c:[[@LINE-15]] +// CHECK: #1 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-10]] + +// CHECK: Origin value: {{.*}}, Taint value was stored to memory at +// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-13]] + +// CHECK: Origin value: {{.*}}, Taint value was created at +// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-17]] diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 35315fe79fc1..19dd41904779 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -256,7 +256,8 @@ static cl::opt ClInstrumentWithCallThreshold( // Controls how to track origins. // * 0: do not track origins. // * 1: track origins at memory store operations. -// * 2: TODO: track origins at memory store operations and callsites. +// * 2: track origins at memory load and store operations. 
+// TODO: track callsites. static cl::opt ClTrackOrigins("dfsan-track-origins", cl::desc("Track origins of labels"), cl::Hidden, cl::init(0)); @@ -453,6 +454,7 @@ class DataFlowSanitizer { FunctionType *DFSanLoadStoreCallbackFnTy; FunctionType *DFSanMemTransferCallbackFnTy; FunctionType *DFSanChainOriginFnTy; + FunctionType *DFSanChainOriginIfTaintedFnTy; FunctionType *DFSanMemOriginTransferFnTy; FunctionType *DFSanMaybeStoreOriginFnTy; FunctionCallee DFSanUnionFn; @@ -469,6 +471,7 @@ class DataFlowSanitizer { FunctionCallee DFSanMemTransferCallbackFn; FunctionCallee DFSanCmpCallbackFn; FunctionCallee DFSanChainOriginFn; + FunctionCallee DFSanChainOriginIfTaintedFn; FunctionCallee DFSanMemOriginTransferFn; FunctionCallee DFSanMaybeStoreOriginFn; SmallPtrSet DFSanRuntimeFunctions; @@ -637,9 +640,18 @@ struct DFSanFunction { Value *combineShadowsThenConvert(Type *T, Value *V1, Value *V2, Instruction *Pos); Value *combineOperandShadows(Instruction *Inst); - std::pair loadShadowOrigin(Value *ShadowAddr, uint64_t Size, + + /// Generates IR to load shadow and origin corresponding to bytes [\p + /// Addr, \p Addr + \p Size), where addr has alignment \p + /// InstAlignment, and take the union of each of those shadows. The returned + /// shadow always has primitive type. + /// + /// When tracking loads is enabled, the returned origin is a chain at the + /// current stack if the returned shadow is tainted. + std::pair loadShadowOrigin(Value *Addr, uint64_t Size, Align InstAlignment, Instruction *Pos); + void storePrimitiveShadowOrigin(Value *Addr, uint64_t Size, Align InstAlignment, Value *PrimitiveShadow, Value *Origin, Instruction *Pos); @@ -695,11 +707,18 @@ private: /// additional call with many instructions. To ensure common cases are fast, /// checks if it is possible to load labels and origins without using the /// callback function. + /// + /// When enabling tracking load instructions, we always use + /// __dfsan_load_label_and_origin to reduce code size. 
bool useCallbackLoadLabelAndOrigin(uint64_t Size, Align InstAlignment); /// Returns a chain at the current stack with previous origin V. Value *updateOrigin(Value *V, IRBuilder<> &IRB); + /// Returns a chain at the current stack with previous origin V if Shadow is + /// tainted. + Value *updateOriginIfTainted(Value *Shadow, Value *Origin, IRBuilder<> &IRB); + /// Creates an Intptr = Origin | Origin << 32 if Intptr's size is 64. Returns /// Origin otherwise. Value *originToIntptr(IRBuilder<> &IRB, Value *Origin); @@ -722,6 +741,13 @@ private: bool shouldInstrumentWithCall(); + /// Generates IR to load shadow and origin corresponding to bytes [\p + /// Addr, \p Addr + \p Size), where addr has alignment \p + /// InstAlignment, and take the union of each of those shadows. The returned + /// shadow always has primitive type. + std::pair + loadShadowOriginSansLoadTracking(Value *Addr, uint64_t Size, + Align InstAlignment, Instruction *Pos); int NumOriginStores = 0; }; @@ -1110,6 +1136,9 @@ bool DataFlowSanitizer::init(Module &M) { /*isVarArg=*/false); DFSanChainOriginFnTy = FunctionType::get(OriginTy, OriginTy, /*isVarArg=*/false); + Type *DFSanChainOriginIfTaintedArgs[2] = {PrimitiveShadowTy, OriginTy}; + DFSanChainOriginIfTaintedFnTy = FunctionType::get( + OriginTy, DFSanChainOriginIfTaintedArgs, /*isVarArg=*/false); Type *DFSanMaybeStoreOriginArgs[4] = {IntegerType::get(*Ctx, ShadowWidthBits), Int8Ptr, IntptrTy, OriginTy}; DFSanMaybeStoreOriginFnTy = FunctionType::get( @@ -1343,6 +1372,15 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanChainOriginFn = Mod->getOrInsertFunction("__dfsan_chain_origin", DFSanChainOriginFnTy, AL); } + { + AttributeList AL; + AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt); + AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt); + AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, + Attribute::ZExt); + DFSanChainOriginIfTaintedFn = Mod->getOrInsertFunction( + 
"__dfsan_chain_origin_if_tainted", DFSanChainOriginIfTaintedFnTy, AL); + } DFSanMemOriginTransferFn = Mod->getOrInsertFunction( "__dfsan_mem_origin_transfer", DFSanMemOriginTransferFnTy); @@ -1381,6 +1419,8 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanCmpCallbackFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanChainOriginFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( + DFSanChainOriginIfTaintedFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanMemOriginTransferFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( @@ -2033,6 +2073,11 @@ Align DFSanFunction::getOriginAlign(Align InstAlignment) { bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size, Align InstAlignment) { + // When enabling tracking load instructions, we always use + // __dfsan_load_label_and_origin to reduce code size. + if (ClTrackOrigins == 2) + return true; + assert(Size != 0); // * if Size == 1, it is sufficient to load its origin aligned at 4. // * if Size == 2, we assume most cases Addr % 2 == 0, so it is sufficient to @@ -2198,13 +2243,8 @@ Value *DFSanFunction::loadLegacyShadowFast(Value *ShadowAddr, uint64_t Size, return Shadow; } -// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where -// Addr has alignment Align, and take the union of each of those shadows. The -// returned shadow always has primitive type. -std::pair DFSanFunction::loadShadowOrigin(Value *Addr, - uint64_t Size, - Align InstAlignment, - Instruction *Pos) { +std::pair DFSanFunction::loadShadowOriginSansLoadTracking( + Value *Addr, uint64_t Size, Align InstAlignment, Instruction *Pos) { const bool ShouldTrackOrigins = DFS.shouldTrackOrigins(); // Non-escaped loads. 
@@ -2309,6 +2349,24 @@ std::pair DFSanFunction::loadShadowOrigin(Value *Addr, return {FallbackCall, Origin}; } +std::pair DFSanFunction::loadShadowOrigin(Value *Addr, + uint64_t Size, + Align InstAlignment, + Instruction *Pos) { + Value *PrimitiveShadow, *Origin; + std::tie(PrimitiveShadow, Origin) = + loadShadowOriginSansLoadTracking(Addr, Size, InstAlignment, Pos); + if (DFS.shouldTrackOrigins()) { + if (ClTrackOrigins == 2) { + IRBuilder<> IRB(Pos); + auto *ConstantShadow = dyn_cast(PrimitiveShadow); + if (!ConstantShadow || !ConstantShadow->isZeroValue()) + Origin = updateOriginIfTainted(PrimitiveShadow, Origin, IRB); + } + } + return {PrimitiveShadow, Origin}; +} + static AtomicOrdering addAcquireOrdering(AtomicOrdering AO) { switch (AO) { case AtomicOrdering::NotAtomic: @@ -2380,6 +2438,12 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) { } } +Value *DFSanFunction::updateOriginIfTainted(Value *Shadow, Value *Origin, + IRBuilder<> &IRB) { + assert(DFS.shouldTrackOrigins()); + return IRB.CreateCall(DFS.DFSanChainOriginIfTaintedFn, {Shadow, Origin}); +} + Value *DFSanFunction::updateOrigin(Value *V, IRBuilder<> &IRB) { if (!DFS.shouldTrackOrigins()) return V; diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll index 87395a88ecb3..0fa1569617b2 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll @@ -55,5 +55,6 @@ define void @store(i8* %p) { ; CHECK: declare void @__dfsan_nonzero_label() ; CHECK: declare void @__dfsan_vararg_wrapper(i8*) ; CHECK: declare zeroext i32 @__dfsan_chain_origin(i32 zeroext) +; CHECK: declare zeroext i32 @__dfsan_chain_origin_if_tainted(i[[#SBITS]] zeroext, i32 zeroext) ; CHECK: declare void @__dfsan_mem_origin_transfer(i8*, i8*, i64) ; CHECK: declare void @__dfsan_maybe_store_origin(i[[#SBITS]] zeroext, i8*, i64, i32 zeroext) diff --git 
a/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll new file mode 100644 index 000000000000..f16a96aa76cb --- /dev/null +++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -dfsan -dfsan-track-origins=2 -dfsan-fast-8-labels -S | FileCheck %s +; RUN: opt < %s -dfsan -dfsan-track-origins=2 -dfsan-fast-16-labels -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]] +; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]] + +define i64 @load64(i64* %p) { + ; CHECK-LABEL: @"dfs$load64" + + ; CHECK-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4 + ; CHECK-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] + + ; CHECK-NEXT: %[[#INTP:]] = bitcast i64* %p to i8* + ; CHECK-NEXT: %[[#LABEL_ORIGIN:]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* %[[#INTP]], i64 8) + ; CHECK-NEXT: %[[#LABEL_ORIGIN_H32:]] = lshr i64 %[[#LABEL_ORIGIN]], 32 + ; CHECK-NEXT: %[[#LABEL:]] = trunc i64 %[[#LABEL_ORIGIN_H32]] to i[[#SBITS]] + ; CHECK-NEXT: %[[#ORIGIN:]] = trunc i64 %[[#LABEL_ORIGIN]] to i32 + ; CHECK-NEXT: %[[#ORIGIN_CHAINED:]] = call i32 @__dfsan_chain_origin_if_tainted(i[[#SBITS]] %[[#LABEL]], i32 %[[#ORIGIN]]) + + ; CHECK-NEXT: %[[#LABEL:]] = or i[[#SBITS]] %[[#LABEL]], %[[#PS]] + ; CHECK-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0 + ; CHECK-NEXT: %[[#ORIGIN_SEL:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN_CHAINED]] + + ; CHECK-NEXT: %a = load i64, i64* %p + ; CHECK-NEXT: store i[[#SBITS]] %[[#LABEL]], 
i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] + ; CHECK-NEXT: store i32 %[[#ORIGIN_SEL]], i32* @__dfsan_retval_origin_tls, align 4 + + %a = load i64, i64* %p + ret i64 %a +}