forked from OSchip/llvm-project
DataFlowSanitizer; Clang changes.
DataFlowSanitizer is a generalised dynamic data flow analysis. Unlike other Sanitizer tools, this tool is not designed to detect a specific class of bugs on its own. Instead, it provides a generic dynamic data flow analysis framework to be used by clients to help detect application-specific issues within their own code. Differential Revision: http://llvm-reviews.chandlerc.com/D966 llvm-svn: 187925
This commit is contained in:
parent
5cbab07d02
commit
c377275a4a
|
@ -0,0 +1,77 @@
|
|||
=================
|
||||
DataFlowSanitizer
|
||||
=================
|
||||
|
||||
.. contents::
|
||||
   :local:
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
DataFlowSanitizer is a generalised dynamic data flow analysis.
|
||||
|
||||
Unlike other Sanitizer tools, this tool is not designed to detect a
|
||||
specific class of bugs on its own. Instead, it provides a generic
|
||||
dynamic data flow analysis framework to be used by clients to help
|
||||
detect application-specific issues within their own code.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
With no program changes, applying DataFlowSanitizer to a program
|
||||
will not alter its behavior. To use DataFlowSanitizer, the program
|
||||
uses API functions to apply tags to data to cause it to be tracked, and to
|
||||
check the tag of a specific data item. DataFlowSanitizer manages
|
||||
the propagation of tags through the program according to its data flow.
|
||||
|
||||
The APIs are defined in the header file ``sanitizer/dfsan_interface.h``.
|
||||
For further information about each function, please refer to the header
|
||||
file.
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
The following program demonstrates label propagation by checking that
|
||||
the correct labels are propagated.
|
||||
|
||||
.. code-block:: c++
|
||||
|
||||
#include <sanitizer/dfsan_interface.h>
|
||||
#include <assert.h>
|
||||
|
||||
int main(void) {
|
||||
int i = 1;
|
||||
dfsan_label i_label = dfsan_create_label("i", 0);
|
||||
dfsan_set_label(i_label, &i, sizeof(i));
|
||||
|
||||
int j = 2;
|
||||
dfsan_label j_label = dfsan_create_label("j", 0);
|
||||
dfsan_set_label(j_label, &j, sizeof(j));
|
||||
|
||||
int k = 3;
|
||||
dfsan_label k_label = dfsan_create_label("k", 0);
|
||||
dfsan_set_label(k_label, &k, sizeof(k));
|
||||
|
||||
dfsan_label ij_label = dfsan_get_label(i + j);
|
||||
assert(dfsan_has_label(ij_label, i_label));
|
||||
assert(dfsan_has_label(ij_label, j_label));
|
||||
assert(!dfsan_has_label(ij_label, k_label));
|
||||
|
||||
dfsan_label ijk_label = dfsan_get_label(i + j + k);
|
||||
assert(dfsan_has_label(ijk_label, i_label));
|
||||
assert(dfsan_has_label(ijk_label, j_label));
|
||||
assert(dfsan_has_label(ijk_label, k_label));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Current status
|
||||
==============
|
||||
|
||||
DataFlowSanitizer is a work in progress, currently under development for
|
||||
x86\_64 Linux.
|
||||
|
||||
Design
|
||||
======
|
||||
|
||||
Please refer to the :doc:`design document<DataFlowSanitizerDesign>`.
|
|
@ -0,0 +1,142 @@
|
|||
DataFlowSanitizer Design Document
|
||||
=================================
|
||||
|
||||
This document sets out the design for DataFlowSanitizer, a general
|
||||
dynamic data flow analysis. Unlike other Sanitizer tools, this tool is
|
||||
not designed to detect a specific class of bugs on its own. Instead,
|
||||
it provides a generic dynamic data flow analysis framework to be used
|
||||
by clients to help detect application-specific issues within their
|
||||
own code.
|
||||
|
||||
DataFlowSanitizer is a program instrumentation which can associate
|
||||
a number of taint labels with any data stored in any memory region
|
||||
accessible by the program. The analysis is dynamic, which means that
|
||||
it operates on a running program, and tracks how the labels propagate
|
||||
through that program. The tool shall support a large (>100) number
|
||||
of labels, such that programs which operate on large numbers of data
|
||||
items may be analysed with each data item being tracked separately.
|
||||
|
||||
Use Cases
|
||||
---------
|
||||
|
||||
This instrumentation can be used as a tool to help monitor how data
|
||||
flows from a program's inputs (sources) to its outputs (sinks).
|
||||
This has applications from a privacy/security perspective in that
|
||||
one can audit how a sensitive data item is used within a program and
|
||||
ensure it isn't exiting the program anywhere it shouldn't be.
|
||||
|
||||
Interface
|
||||
---------
|
||||
|
||||
A number of functions are provided which will create taint labels,
|
||||
attach labels to memory regions and extract the set of labels
|
||||
associated with a specific memory region. These functions are declared
|
||||
in the header file ``sanitizer/dfsan_interface.h``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/// Creates and returns a base label with the given description and user data.
|
||||
dfsan_label dfsan_create_label(const char *desc, void *userdata);
|
||||
|
||||
/// Sets the label for each address in [addr,addr+size) to \c label.
|
||||
void dfsan_set_label(dfsan_label label, void *addr, size_t size);
|
||||
|
||||
/// Sets the label for each address in [addr,addr+size) to the union of the
|
||||
/// current label for that address and \c label.
|
||||
void dfsan_add_label(dfsan_label label, void *addr, size_t size);
|
||||
|
||||
/// Retrieves the label associated with the given data.
|
||||
///
|
||||
/// The type of 'data' is arbitrary. The function accepts a value of any type,
|
||||
/// which can be truncated or extended (implicitly or explicitly) as necessary.
|
||||
/// The truncation/extension operations will preserve the label of the original
|
||||
/// value.
|
||||
dfsan_label dfsan_get_label(long data);
|
||||
|
||||
/// Retrieves a pointer to the dfsan_label_info struct for the given label.
|
||||
const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
|
||||
|
||||
/// Returns whether the given label \c label contains the label \c elem.
|
||||
int dfsan_has_label(dfsan_label label, dfsan_label elem);
|
||||
|
||||
/// If the given label \c label contains a label with the description \c desc, returns
|
||||
/// that label, else returns 0.
|
||||
dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc);
|
||||
|
||||
Taint label representation
|
||||
--------------------------
|
||||
|
||||
As stated above, the tool must track a large number of taint
|
||||
labels. This poses an implementation challenge, as most multiple-label
|
||||
tainting systems assign one label per bit to shadow storage, and
|
||||
union taint labels using a bitwise OR operation. This will not scale
|
||||
to clients which use hundreds or thousands of taint labels, as the
|
||||
label union operation becomes O(n) in the number of supported labels,
|
||||
and data associated with it will quickly dominate the live variable
|
||||
set, causing register spills and hampering performance.
|
||||
|
||||
Instead, a low overhead approach is proposed which is best-case O(log\
|
||||
:sub:`2` n) during execution. The underlying assumption is that
|
||||
the required space of label unions is sparse, which is a reasonable
|
||||
assumption to make given that we are optimizing for the case where
|
||||
applications mostly copy data from one place to another, without often
|
||||
invoking the need for an actual union operation. The representation
|
||||
of a taint label is a 16-bit integer, and new labels are allocated
|
||||
sequentially from a pool. The label identifier 0 is special, and means
|
||||
that the data item is unlabelled.
|
||||
|
||||
When a label union operation is requested at a join point (any
|
||||
arithmetic or logical operation with two or more operands, such as
|
||||
addition), the code checks whether a union is required, whether the
|
||||
same union has been requested before, and whether one union label
|
||||
subsumes the other. If so, it returns the previously allocated union
|
||||
label. If not, it allocates a new union label from the same pool used
|
||||
for new labels.
|
||||
|
||||
Specifically, the instrumentation pass will insert code like this
|
||||
to decide the union label ``lu`` for a pair of labels ``l1``
|
||||
and ``l2``:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
if (l1 == l2)
|
||||
lu = l1;
|
||||
else
|
||||
lu = __dfsan_union(l1, l2);
|
||||
|
||||
The equality comparison is kept outside the runtime call (emitted inline), to provide an early exit in
|
||||
the common cases where the program is processing unlabelled data, or
|
||||
where the two data items have the same label. ``__dfsan_union`` is
|
||||
a runtime library function which performs all other union computation.
|
||||
|
||||
Further optimizations are possible, for example if ``l1`` is known
|
||||
at compile time to be zero (e.g. it is derived from a constant),
|
||||
``l2`` can be used for ``lu``, and vice versa.
|
||||
|
||||
Memory layout and label management
|
||||
----------------------------------
|
||||
|
||||
The following is the current memory layout for Linux/x86\_64:
|
||||
|
||||
+---------------+---------------+--------------------+
|
||||
|    Start      |    End        |        Use         |
|
||||
+===============+===============+====================+
|
||||
| 0x700000008000|0x800000000000 | application memory |
|
||||
+---------------+---------------+--------------------+
|
||||
| 0x200200000000|0x700000008000 |       unused       |
|
||||
+---------------+---------------+--------------------+
|
||||
| 0x200000000000|0x200200000000 |    union table     |
|
||||
+---------------+---------------+--------------------+
|
||||
| 0x000000010000|0x200000000000 |   shadow memory    |
|
||||
+---------------+---------------+--------------------+
|
||||
| 0x000000000000|0x000000010000 | reserved by kernel |
|
||||
+---------------+---------------+--------------------+
|
||||
|
||||
Each byte of application memory corresponds to two bytes of shadow
|
||||
memory, which are used to store its taint label. As for LLVM SSA
|
||||
registers, we have not found it necessary to associate a label with
|
||||
each byte or bit of data, as some other tools do. Instead, labels are
|
||||
associated directly with registers. Loads will result in a union of
|
||||
all shadow labels corresponding to bytes loaded (which most of the
|
||||
time will be short circuited by the initial comparison) and stores will
|
||||
result in a copy of the label to the shadow of all bytes stored to.
|
|
@ -895,6 +895,8 @@ are listed below.
|
|||
used in conjunction with the ``-fsanitize-undefined-trap-on-error``
|
||||
flag. This includes all of the checks listed below other than
|
||||
``unsigned-integer-overflow`` and ``vptr``.
|
||||
- ``-fsanitize=dataflow``: :doc:`DataFlowSanitizer`, a general data
|
||||
flow analysis.
|
||||
|
||||
The following more fine-grained checks are also available:
|
||||
|
||||
|
|
|
@ -77,6 +77,9 @@ SANITIZER("vptr", Vptr)
|
|||
// IntegerSanitizer
|
||||
SANITIZER("unsigned-integer-overflow", UnsignedIntegerOverflow)
|
||||
|
||||
// DataFlowSanitizer
|
||||
SANITIZER("dataflow", DataFlow)
|
||||
|
||||
// -fsanitize=undefined includes all the sanitizers which have low overhead, no
|
||||
// ABI or address space layout implications, and only catch undefined behavior.
|
||||
SANITIZER_GROUP("undefined", Undefined,
|
||||
|
|
|
@ -206,6 +206,11 @@ static void addThreadSanitizerPass(const PassManagerBuilder &Builder,
|
|||
PM.add(createThreadSanitizerPass(CGOpts.SanitizerBlacklistFile));
|
||||
}
|
||||
|
||||
static void addDataFlowSanitizerPass(const PassManagerBuilder &Builder,
|
||||
PassManagerBase &PM) {
|
||||
PM.add(createDataFlowSanitizerPass());
|
||||
}
|
||||
|
||||
void EmitAssemblyHelper::CreatePasses(TargetMachine *TM) {
|
||||
unsigned OptLevel = CodeGenOpts.OptimizationLevel;
|
||||
CodeGenOptions::InliningMethod Inlining = CodeGenOpts.getInlining();
|
||||
|
@ -265,6 +270,13 @@ void EmitAssemblyHelper::CreatePasses(TargetMachine *TM) {
|
|||
addThreadSanitizerPass);
|
||||
}
|
||||
|
||||
if (LangOpts.Sanitize.DataFlow) {
|
||||
PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
|
||||
addDataFlowSanitizerPass);
|
||||
PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
|
||||
addDataFlowSanitizerPass);
|
||||
}
|
||||
|
||||
// Figure out TargetLibraryInfo.
|
||||
Triple TargetTriple(TheModule->getTargetTriple());
|
||||
PMBuilder.LibraryInfo = new TargetLibraryInfo(TargetTriple);
|
||||
|
|
|
@ -37,10 +37,11 @@ class SanitizerArgs {
|
|||
NeedsAsanRt = Address,
|
||||
NeedsTsanRt = Thread,
|
||||
NeedsMsanRt = Memory,
|
||||
NeedsDfsanRt = DataFlow,
|
||||
NeedsLeakDetection = Leak,
|
||||
NeedsUbsanRt = Undefined | Integer,
|
||||
NotAllowedWithTrap = Vptr,
|
||||
HasZeroBaseShadow = Thread | Memory
|
||||
HasZeroBaseShadow = Thread | Memory | DataFlow
|
||||
};
|
||||
unsigned Kind;
|
||||
std::string BlacklistFile;
|
||||
|
@ -66,6 +67,7 @@ class SanitizerArgs {
|
|||
return false;
|
||||
return Kind & NeedsUbsanRt;
|
||||
}
|
||||
bool needsDfsanRt() const { return Kind & NeedsDfsanRt; }
|
||||
|
||||
bool sanitizesVptr() const { return Kind & Vptr; }
|
||||
bool notAllowedWithTrap() const { return Kind & NotAllowedWithTrap; }
|
||||
|
|
|
@ -1860,6 +1860,12 @@ static void addUbsanRTLinux(const ToolChain &TC, const ArgList &Args,
|
|||
addSanitizerRTLinkFlagsLinux(TC, Args, CmdArgs, "ubsan_cxx", false);
|
||||
}
|
||||
|
||||
static void addDfsanRTLinux(const ToolChain &TC, const ArgList &Args,
|
||||
ArgStringList &CmdArgs) {
|
||||
if (!Args.hasArg(options::OPT_shared))
|
||||
addSanitizerRTLinkFlagsLinux(TC, Args, CmdArgs, "dfsan", true);
|
||||
}
|
||||
|
||||
static bool shouldUseFramePointer(const ArgList &Args,
|
||||
const llvm::Triple &Triple) {
|
||||
if (Arg *A = Args.getLastArg(options::OPT_fno_omit_frame_pointer,
|
||||
|
@ -6275,6 +6281,8 @@ void gnutools::Link::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
addMsanRTLinux(getToolChain(), Args, CmdArgs);
|
||||
if (Sanitize.needsLsanRt())
|
||||
addLsanRTLinux(getToolChain(), Args, CmdArgs);
|
||||
if (Sanitize.needsDfsanRt())
|
||||
addDfsanRTLinux(getToolChain(), Args, CmdArgs);
|
||||
|
||||
// The profile runtime also needs access to system libraries.
|
||||
addProfileRTLinux(getToolChain(), Args, CmdArgs);
|
||||
|
|
|
@ -908,6 +908,7 @@ static bool HasFeature(const Preprocessor &PP, const IdentifierInfo *II) {
|
|||
.Case("enumerator_attributes", true)
|
||||
.Case("memory_sanitizer", LangOpts.Sanitize.Memory)
|
||||
.Case("thread_sanitizer", LangOpts.Sanitize.Thread)
|
||||
.Case("dataflow_sanitizer", LangOpts.Sanitize.DataFlow)
|
||||
// Objective-C features
|
||||
.Case("objc_arr", LangOpts.ObjCAutoRefCount) // FIXME: REMOVE?
|
||||
.Case("objc_arc", LangOpts.ObjCAutoRefCount)
|
||||
|
|
|
@ -109,7 +109,8 @@ endif
|
|||
ifeq ($(ARCH),x86_64)
|
||||
RuntimeLibrary.linux.Configs += \
|
||||
full-x86_64.a profile-x86_64.a san-x86_64.a asan-x86_64.a \
|
||||
tsan-x86_64.a msan-x86_64.a ubsan-x86_64.a ubsan_cxx-x86_64.a
|
||||
tsan-x86_64.a msan-x86_64.a ubsan-x86_64.a ubsan_cxx-x86_64.a \
|
||||
dfsan-x86_64.a
|
||||
# We need to build 32-bit ASan/UBsan libraries on 64-bit platform, and add them
|
||||
# to the list of runtime libraries to make
|
||||
# "clang -fsanitize=(address|undefined) -m32" work.
|
||||
|
|
Loading…
Reference in New Issue