2017-12-09 08:21:41 +08:00
|
|
|
//===- HWAddressSanitizer.cpp - detector of memory bugs -------------------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2017-12-09 08:21:41 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// This file is a part of HWAddressSanitizer, an address sanity checker
|
|
|
|
/// based on tagged addressing.
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
#include "llvm/ADT/StringExtras.h"
|
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/ADT/Triple.h"
|
2019-08-07 06:07:29 +08:00
|
|
|
#include "llvm/BinaryFormat/ELF.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/IR/Attributes.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constant.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
2019-06-18 07:39:41 +08:00
|
|
|
#include "llvm/IR/DebugInfoMetadata.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/IRBuilder.h"
|
|
|
|
#include "llvm/IR/InlineAsm.h"
|
|
|
|
#include "llvm/IR/InstVisitor.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
|
|
#include "llvm/IR/Intrinsics.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2018-01-12 06:53:30 +08:00
|
|
|
#include "llvm/IR/MDBuilder.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
|
|
|
#include "llvm/Pass.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
2018-01-12 06:53:30 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/Transforms/Instrumentation.h"
|
2018-01-12 06:53:30 +08:00
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2017-12-09 08:21:41 +08:00
|
|
|
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
2018-01-12 06:53:30 +08:00
|
|
|
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
|
2018-10-23 08:50:40 +08:00
|
|
|
#include <sstream>
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "hwasan"
|
|
|
|
|
|
|
|
static const char *const kHwasanModuleCtorName = "hwasan.module_ctor";
|
2019-08-07 06:07:29 +08:00
|
|
|
static const char *const kHwasanNoteName = "hwasan.note";
|
2017-12-09 08:21:41 +08:00
|
|
|
static const char *const kHwasanInitName = "__hwasan_init";
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
static const char *const kHwasanShadowMemoryDynamicAddress =
|
|
|
|
"__hwasan_shadow_memory_dynamic_address";
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
// Access sizes are powers of two: 1, 2, 4, 8, 16.
|
|
|
|
static const size_t kNumberOfAccessSizes = 5;
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
static const size_t kDefaultShadowScale = 4;
|
|
|
|
static const uint64_t kDynamicShadowSentinel =
|
|
|
|
std::numeric_limits<uint64_t>::max();
|
2017-12-13 09:16:34 +08:00
|
|
|
static const unsigned kPointerTagShift = 56;
|
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
static const unsigned kShadowBaseAlignment = 32;
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
|
|
|
|
"hwasan-memory-access-callback-prefix",
|
|
|
|
cl::desc("Prefix for memory access callbacks"), cl::Hidden,
|
|
|
|
cl::init("__hwasan_"));
|
|
|
|
|
2017-12-13 09:16:34 +08:00
|
|
|
static cl::opt<bool>
|
|
|
|
ClInstrumentWithCalls("hwasan-instrument-with-calls",
|
|
|
|
cl::desc("instrument reads and writes with callbacks"),
|
|
|
|
cl::Hidden, cl::init(false));
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
|
|
|
|
cl::desc("instrument read instructions"),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
static cl::opt<bool> ClInstrumentWrites(
|
|
|
|
"hwasan-instrument-writes", cl::desc("instrument write instructions"),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
static cl::opt<bool> ClInstrumentAtomics(
|
|
|
|
"hwasan-instrument-atomics",
|
|
|
|
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
|
|
|
|
cl::init(true));
|
|
|
|
|
2017-12-21 03:05:44 +08:00
|
|
|
static cl::opt<bool> ClRecover(
|
|
|
|
"hwasan-recover",
|
|
|
|
cl::desc("Enable recovery mode (continue-after-error)."),
|
|
|
|
cl::Hidden, cl::init(false));
|
|
|
|
|
2018-01-12 06:53:30 +08:00
|
|
|
static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
|
|
|
|
cl::desc("instrument stack (allocas)"),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
2018-06-30 04:20:17 +08:00
|
|
|
static cl::opt<bool> ClUARRetagToZero(
|
|
|
|
"hwasan-uar-retag-to-zero",
|
|
|
|
cl::desc("Clear alloca tags before returning from the function to allow "
|
|
|
|
"non-instrumented and instrumented function calls mix. When set "
|
|
|
|
"to false, allocas are retagged before returning from the "
|
|
|
|
"function to detect use after return."),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
2018-01-13 09:32:15 +08:00
|
|
|
static cl::opt<bool> ClGenerateTagsWithCalls(
|
|
|
|
"hwasan-generate-tags-with-calls",
|
|
|
|
cl::desc("generate new tags with runtime library calls"), cl::Hidden,
|
|
|
|
cl::init(false));
|
|
|
|
|
2019-08-07 06:07:29 +08:00
|
|
|
static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
|
|
|
|
cl::Hidden, cl::init(false));
|
|
|
|
|
2018-04-05 04:44:59 +08:00
|
|
|
static cl::opt<int> ClMatchAllTag(
|
|
|
|
"hwasan-match-all-tag",
|
2018-04-21 04:04:04 +08:00
|
|
|
cl::desc("don't report bad accesses via pointers with this tag"),
|
|
|
|
cl::Hidden, cl::init(-1));
|
2018-04-05 04:44:59 +08:00
|
|
|
|
2018-01-18 07:24:38 +08:00
|
|
|
static cl::opt<bool> ClEnableKhwasan(
|
2018-04-21 04:04:04 +08:00
|
|
|
"hwasan-kernel",
|
|
|
|
cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
|
2018-01-18 07:24:38 +08:00
|
|
|
cl::Hidden, cl::init(false));
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
// These flags allow changing the shadow mapping and control how shadow memory
|
|
|
|
// is accessed. The shadow mapping looks like:
|
|
|
|
// Shadow = (Mem >> scale) + offset
|
|
|
|
|
2019-04-24 10:40:20 +08:00
|
|
|
static cl::opt<uint64_t>
|
|
|
|
ClMappingOffset("hwasan-mapping-offset",
|
|
|
|
cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
|
|
|
|
cl::Hidden, cl::init(0));
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2018-08-11 00:21:37 +08:00
|
|
|
static cl::opt<bool>
|
|
|
|
ClWithIfunc("hwasan-with-ifunc",
|
|
|
|
cl::desc("Access dynamic shadow through an ifunc global on "
|
|
|
|
"platforms that support this"),
|
|
|
|
cl::Hidden, cl::init(false));
|
2018-09-25 07:03:34 +08:00
|
|
|
|
|
|
|
// Hidden flag, enabled by default: access the dynamic shadow through a
// thread-local pointer on platforms that support this.
static cl::opt<bool> ClWithTls(
    "hwasan-with-tls",
    cl::desc("Access dynamic shadow through a thread-local pointer on "
             "platforms that support this"),
    cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
static cl::opt<bool>
|
|
|
|
ClRecordStackHistory("hwasan-record-stack-history",
|
|
|
|
cl::desc("Record stack frames with tagged allocations "
|
|
|
|
"in a thread-local ring buffer"),
|
|
|
|
cl::Hidden, cl::init(true));
|
2018-12-20 17:04:33 +08:00
|
|
|
static cl::opt<bool>
|
|
|
|
ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
|
|
|
|
cl::desc("instrument memory intrinsics"),
|
2018-12-25 00:02:48 +08:00
|
|
|
cl::Hidden, cl::init(true));
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
|
2019-05-17 07:54:41 +08:00
|
|
|
static cl::opt<bool>
|
|
|
|
ClInstrumentLandingPads("hwasan-instrument-landing-pads",
|
|
|
|
cl::desc("instrument landing pads"), cl::Hidden,
|
|
|
|
cl::init(true));
|
|
|
|
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
|
|
|
|
cl::desc("inline all checks"),
|
|
|
|
cl::Hidden, cl::init(false));
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
namespace {
|
|
|
|
|
2018-05-01 23:54:18 +08:00
|
|
|
/// An instrumentation pass implementing detection of addressability bugs
|
2017-12-09 08:21:41 +08:00
|
|
|
/// using tagged pointers.
|
2019-05-15 05:17:21 +08:00
|
|
|
class HWAddressSanitizer {
|
2017-12-09 08:21:41 +08:00
|
|
|
public:
|
2019-05-15 05:17:21 +08:00
|
|
|
/// Sets up the sanitizer for module \p M.
///
/// \p CompileKernel and \p Recover are used as given unless the matching
/// command-line option (-hwasan-kernel / -hwasan-recover) occurred at least
/// once, in which case the option's value is used instead. The module is then
/// initialized via initializeModule().
explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
                            bool Recover = false)
    : M(M) {
  // An option overrides the argument only when it was actually specified.
  if (ClRecover.getNumOccurrences() > 0)
    this->Recover = ClRecover;
  else
    this->Recover = Recover;

  if (ClEnableKhwasan.getNumOccurrences() > 0)
    this->CompileKernel = ClEnableKhwasan;
  else
    this->CompileKernel = CompileKernel;

  initializeModule();
}
|
2017-12-09 08:21:41 +08:00
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
bool sanitizeFunction(Function &F);
|
2019-08-07 06:07:29 +08:00
|
|
|
void initializeModule();
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
void initializeCallbacks(Module &M);
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2019-01-24 06:39:11 +08:00
|
|
|
Value *getDynamicShadowIfunc(IRBuilder<> &IRB);
|
2018-09-25 07:03:34 +08:00
|
|
|
Value *getDynamicShadowNonTls(IRBuilder<> &IRB);
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2018-03-24 01:57:54 +08:00
|
|
|
void untagPointerOperand(Instruction *I, Value *Addr);
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Value *shadowBase();
|
|
|
|
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
|
|
|
|
void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
|
2017-12-13 09:16:34 +08:00
|
|
|
unsigned AccessSizeIndex,
|
|
|
|
Instruction *InsertBefore);
|
2018-12-20 17:04:33 +08:00
|
|
|
void instrumentMemIntrinsic(MemIntrinsic *MI);
|
2017-12-09 08:21:41 +08:00
|
|
|
bool instrumentMemAccess(Instruction *I);
|
|
|
|
Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
|
|
|
|
uint64_t *TypeSize, unsigned *Alignment,
|
|
|
|
Value **MaybeMask);
|
|
|
|
|
2018-01-12 06:53:30 +08:00
|
|
|
bool isInterestingAlloca(const AllocaInst &AI);
|
2019-07-10 04:22:36 +08:00
|
|
|
bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
|
2018-02-09 08:59:10 +08:00
|
|
|
Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
|
2018-02-22 03:52:23 +08:00
|
|
|
Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
|
2019-06-18 07:39:41 +08:00
|
|
|
bool instrumentStack(
|
|
|
|
SmallVectorImpl<AllocaInst *> &Allocas,
|
|
|
|
DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap,
|
|
|
|
SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
|
2019-06-28 07:24:07 +08:00
|
|
|
Value *readRegister(IRBuilder<> &IRB, StringRef Name);
|
2019-05-17 07:54:41 +08:00
|
|
|
bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
|
2018-01-13 09:32:15 +08:00
|
|
|
Value *getNextTagWithCall(IRBuilder<> &IRB);
|
|
|
|
Value *getStackBaseTag(IRBuilder<> &IRB);
|
|
|
|
Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
|
|
|
|
unsigned AllocaNo);
|
|
|
|
Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
|
2018-01-12 06:53:30 +08:00
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
|
2019-06-18 07:39:51 +08:00
|
|
|
void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord);
|
2018-09-25 07:03:34 +08:00
|
|
|
|
2019-08-07 06:07:29 +08:00
|
|
|
void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
|
|
|
|
void instrumentGlobals();
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
private:
|
|
|
|
LLVMContext *C;
|
2019-08-07 06:07:29 +08:00
|
|
|
Module &M;
|
2018-03-24 01:57:54 +08:00
|
|
|
Triple TargetTriple;
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset;
|
2019-05-17 07:54:41 +08:00
|
|
|
FunctionCallee HWAsanHandleVfork;
|
2018-03-24 01:57:54 +08:00
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
/// This struct defines the shadow mapping using the rule:
|
|
|
|
/// shadow = (mem >> Scale) + Offset.
|
|
|
|
/// If InGlobal is true, then
|
|
|
|
/// extern char __hwasan_shadow[];
|
|
|
|
/// shadow = (mem >> Scale) + &__hwasan_shadow
|
2018-09-25 07:03:34 +08:00
|
|
|
/// If InTls is true, then
|
|
|
|
/// extern char *__hwasan_tls;
|
2018-10-23 08:50:40 +08:00
|
|
|
/// shadow = (mem>>Scale) + align_up(__hwasan_tls, kShadowBaseAlignment)
|
2018-04-21 04:04:04 +08:00
|
|
|
/// Parameters of the mem-to-shadow mapping, shadow = (mem >> Scale) + Offset.
struct ShadowMapping {
|
|
|
|
/// Right-shift applied to the memory address in the mapping formula.
int Scale;
|
|
|
|
/// Constant added after the shift in the mapping formula.
uint64_t Offset;
|
|
|
|
/// True when the shadow base is the address of the __hwasan_shadow global.
bool InGlobal;
|
2018-09-25 07:03:34 +08:00
|
|
|
/// True when the thread-local (__hwasan_tls) form of the mapping is in use.
bool InTls;
|
2018-04-21 04:04:04 +08:00
|
|
|
|
|
|
|
/// Defined out of line. NOTE(review): presumably fills in the fields above
/// for \p TargetTriple - confirm against the definition.
void init(Triple &TargetTriple);
|
2019-08-07 06:07:29 +08:00
|
|
|
/// Returns 1U << Scale, i.e. two to the power of Scale.
unsigned getObjectAlignment() const { return 1U << Scale; }
|
2018-04-21 04:04:04 +08:00
|
|
|
};
|
|
|
|
ShadowMapping Mapping;
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
Type *IntptrTy;
|
2018-08-15 08:39:35 +08:00
|
|
|
Type *Int8PtrTy;
|
2018-01-12 06:53:30 +08:00
|
|
|
Type *Int8Ty;
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Type *Int32Ty;
|
2019-08-07 06:07:29 +08:00
|
|
|
Type *Int64Ty = Type::getInt64Ty(M.getContext());
|
2017-12-09 08:21:41 +08:00
|
|
|
|
2018-04-14 02:05:21 +08:00
|
|
|
bool CompileKernel;
|
2017-12-21 03:05:44 +08:00
|
|
|
bool Recover;
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
Function *HwasanCtorFunction;
|
|
|
|
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
|
|
|
|
FunctionCallee HwasanMemoryAccessCallbackSized[2];
|
2018-01-12 06:53:30 +08:00
|
|
|
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
FunctionCallee HwasanTagMemoryFunc;
|
|
|
|
FunctionCallee HwasanGenerateTagFunc;
|
|
|
|
FunctionCallee HwasanThreadEnterFunc;
|
2018-04-21 04:04:04 +08:00
|
|
|
|
|
|
|
Constant *ShadowGlobal;
|
|
|
|
|
|
|
|
Value *LocalDynamicShadow = nullptr;
|
2019-06-18 07:39:51 +08:00
|
|
|
Value *StackBaseTag = nullptr;
|
2018-09-25 07:03:34 +08:00
|
|
|
GlobalValue *ThreadPtrGlobal = nullptr;
|
2017-12-09 08:21:41 +08:00
|
|
|
};
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
class HWAddressSanitizerLegacyPass : public FunctionPass {
|
|
|
|
public:
|
|
|
|
// Pass identification, replacement for typeid.
|
|
|
|
static char ID;
|
|
|
|
|
|
|
|
explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
|
|
|
|
bool Recover = false)
|
|
|
|
: FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {}
|
|
|
|
|
|
|
|
StringRef getPassName() const override { return "HWAddressSanitizer"; }
|
|
|
|
|
2019-07-18 05:45:19 +08:00
|
|
|
bool doInitialization(Module &M) override {
|
|
|
|
HWASan = llvm::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
bool runOnFunction(Function &F) override {
|
2019-07-18 05:45:19 +08:00
|
|
|
return HWASan->sanitizeFunction(F);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool doFinalization(Module &M) override {
|
|
|
|
HWASan.reset();
|
|
|
|
return false;
|
2019-05-15 05:17:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2019-07-18 05:45:19 +08:00
|
|
|
std::unique_ptr<HWAddressSanitizer> HWASan;
|
2019-05-15 05:17:21 +08:00
|
|
|
bool CompileKernel;
|
|
|
|
bool Recover;
|
|
|
|
};
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
} // end anonymous namespace
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
char HWAddressSanitizerLegacyPass::ID = 0;
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
INITIALIZE_PASS_BEGIN(
|
2019-05-15 05:17:21 +08:00
|
|
|
HWAddressSanitizerLegacyPass, "hwasan",
|
2018-04-21 04:04:04 +08:00
|
|
|
"HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
|
|
|
|
false)
|
2017-12-09 08:21:41 +08:00
|
|
|
INITIALIZE_PASS_END(
|
2019-05-15 05:17:21 +08:00
|
|
|
HWAddressSanitizerLegacyPass, "hwasan",
|
2018-04-21 04:04:04 +08:00
|
|
|
"HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
|
|
|
|
false)
|
2017-12-09 08:21:41 +08:00
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
/// Factory for the legacy-pass-manager HWASan pass.
///
/// Asserts that kernel compilation is only requested together with recovery
/// mode. The returned pass is heap-allocated with `new`.
FunctionPass *
llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel,
                                             bool Recover) {
  assert(!CompileKernel || Recover);
  FunctionPass *Pass = new HWAddressSanitizerLegacyPass(CompileKernel, Recover);
  return Pass;
}
|
|
|
|
|
|
|
|
/// New-pass-manager entry point: only records the two configuration flags,
/// which run() later hands to HWAddressSanitizer.
HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel,
                                               bool Recover)
    : CompileKernel(CompileKernel), Recover(Recover) {}
|
|
|
|
|
2019-07-18 05:45:19 +08:00
|
|
|
/// Instruments every function of \p M with a single HWAddressSanitizer
/// instance.
///
/// \returns PreservedAnalyses::none() if any function was modified,
/// PreservedAnalyses::all() otherwise. \p MAM is not consulted.
PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
  HWAddressSanitizer HWASan(M, CompileKernel, Recover);

  // Every function is visited, even after a change has been seen.
  bool AnyChanged = false;
  for (Function &F : M) {
    if (HWASan.sanitizeFunction(F))
      AnyChanged = true;
  }

  return AnyChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
|
|
|
|
|
2018-05-01 23:54:18 +08:00
|
|
|
/// Module-level initialization.
|
2017-12-09 08:21:41 +08:00
|
|
|
///
|
|
|
|
/// inserts a call to __hwasan_init to the module's constructor list.
|
2019-08-07 06:07:29 +08:00
|
|
|
void HWAddressSanitizer::initializeModule() {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
|
2017-12-09 08:21:41 +08:00
|
|
|
auto &DL = M.getDataLayout();
|
|
|
|
|
2018-03-24 01:57:54 +08:00
|
|
|
TargetTriple = Triple(M.getTargetTriple());
|
2017-12-09 08:21:41 +08:00
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
Mapping.init(TargetTriple);
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
C = &(M.getContext());
|
|
|
|
IRBuilder<> IRB(*C);
|
|
|
|
IntptrTy = IRB.getIntPtrTy(DL);
|
2018-08-15 08:39:35 +08:00
|
|
|
Int8PtrTy = IRB.getInt8PtrTy();
|
2018-01-12 06:53:30 +08:00
|
|
|
Int8Ty = IRB.getInt8Ty();
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Int32Ty = IRB.getInt32Ty();
|
2017-12-09 08:21:41 +08:00
|
|
|
|
2018-01-18 22:19:04 +08:00
|
|
|
HwasanCtorFunction = nullptr;
|
2018-04-14 02:05:21 +08:00
|
|
|
if (!CompileKernel) {
|
2018-01-18 07:24:38 +08:00
|
|
|
std::tie(HwasanCtorFunction, std::ignore) =
|
2019-05-15 05:17:21 +08:00
|
|
|
getOrCreateSanitizerCtorAndInitFunctions(
|
|
|
|
M, kHwasanModuleCtorName, kHwasanInitName,
|
|
|
|
/*InitArgTypes=*/{},
|
|
|
|
/*InitArgs=*/{},
|
|
|
|
// This callback is invoked when the functions are created the first
|
|
|
|
// time. Hook them into the global ctors list in that case:
|
|
|
|
[&](Function *Ctor, FunctionCallee) {
|
|
|
|
Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
|
|
|
|
Ctor->setComdat(CtorComdat);
|
|
|
|
appendToGlobalCtors(M, Ctor, 0, Ctor);
|
|
|
|
});
|
2019-08-07 06:07:29 +08:00
|
|
|
|
|
|
|
// Older versions of Android do not have the required runtime support for
|
|
|
|
// global instrumentation. On other platforms we currently require using the
|
|
|
|
// latest version of the runtime.
|
|
|
|
bool InstrumentGlobals =
|
|
|
|
!TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
|
|
|
|
if (ClGlobals.getNumOccurrences())
|
|
|
|
InstrumentGlobals = ClGlobals;
|
|
|
|
if (InstrumentGlobals)
|
|
|
|
instrumentGlobals();
|
2018-10-23 08:50:40 +08:00
|
|
|
}
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
if (!TargetTriple.isAndroid()) {
|
|
|
|
Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] {
|
2019-07-16 12:46:31 +08:00
|
|
|
auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false,
|
2019-05-15 05:17:21 +08:00
|
|
|
GlobalValue::ExternalLinkage, nullptr,
|
|
|
|
"__hwasan_tls", nullptr,
|
|
|
|
GlobalVariable::InitialExecTLSModel);
|
|
|
|
appendToCompilerUsed(M, GV);
|
|
|
|
return GV;
|
|
|
|
});
|
|
|
|
ThreadPtrGlobal = cast<GlobalVariable>(C);
|
|
|
|
}
|
2017-12-09 08:21:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void HWAddressSanitizer::initializeCallbacks(Module &M) {
|
|
|
|
IRBuilder<> IRB(*C);
|
|
|
|
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
|
|
|
|
const std::string TypeStr = AccessIsWrite ? "store" : "load";
|
2017-12-21 03:05:44 +08:00
|
|
|
const std::string EndingStr = Recover ? "_noabort" : "";
|
2017-12-09 08:21:41 +08:00
|
|
|
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(
|
|
|
|
ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
|
|
|
|
FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false));
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
|
|
|
|
AccessSizeIndex++) {
|
|
|
|
HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
M.getOrInsertFunction(
|
2017-12-09 08:21:41 +08:00
|
|
|
ClMemoryAccessCallbackPrefix + TypeStr +
|
2017-12-21 03:05:44 +08:00
|
|
|
itostr(1ULL << AccessSizeIndex) + EndingStr,
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false));
|
2017-12-09 08:21:41 +08:00
|
|
|
}
|
|
|
|
}
|
2018-01-12 06:53:30 +08:00
|
|
|
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
HwasanTagMemoryFunc = M.getOrInsertFunction(
|
|
|
|
"__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy);
|
|
|
|
HwasanGenerateTagFunc =
|
|
|
|
M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty);
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2019-01-24 06:39:11 +08:00
|
|
|
ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
|
|
|
|
ArrayType::get(IRB.getInt8Ty(), 0));
|
2018-12-20 17:04:33 +08:00
|
|
|
|
|
|
|
const std::string MemIntrinCallbackPrefix =
|
|
|
|
CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
|
|
|
|
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
|
|
|
|
IRB.getInt8PtrTy(), IntptrTy);
|
|
|
|
HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
|
|
|
|
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
|
|
|
|
IRB.getInt8PtrTy(), IntptrTy);
|
|
|
|
HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
|
|
|
|
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
|
|
|
|
IRB.getInt32Ty(), IntptrTy);
|
|
|
|
|
2019-05-17 07:54:41 +08:00
|
|
|
HWAsanHandleVfork =
|
|
|
|
M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy);
|
|
|
|
|
[opaque pointer types] Add a FunctionCallee wrapper type, and use it.
Recommit r352791 after tweaking DerivedTypes.h slightly, so that gcc
doesn't choke on it, hopefully.
Original Message:
The FunctionCallee type is effectively a {FunctionType*,Value*} pair,
and is a useful convenience to enable code to continue passing the
result of getOrInsertFunction() through to EmitCall, even once pointer
types lose their pointee-type.
Then:
- update the CallInst/InvokeInst instruction creation functions to
take a Callee,
- modify getOrInsertFunction to return FunctionCallee, and
- update all callers appropriately.
One area of particular note is the change to the sanitizer
code. Previously, they had been casting the result of
`getOrInsertFunction` to a `Function*` via
`checkSanitizerInterfaceFunction`, and storing that. That would report
an error if someone had already inserted a function declaraction with
a mismatching signature.
However, in general, LLVM allows for such mismatches, as
`getOrInsertFunction` will automatically insert a bitcast if
needed. As part of this cleanup, cause the sanitizer code to do the
same. (It will call its functions using the expected signature,
however they may have been declared.)
Finally, in a small number of locations, callers of
`getOrInsertFunction` actually were expecting/requiring that a brand
new function was being created. In such cases, I've switched them to
Function::Create instead.
Differential Revision: https://reviews.llvm.org/D57315
llvm-svn: 352827
2019-02-01 10:28:03 +08:00
|
|
|
HwasanThreadEnterFunc =
|
|
|
|
M.getOrInsertFunction("__hwasan_thread_enter", IRB.getVoidTy());
|
2018-04-21 04:04:04 +08:00
|
|
|
}
|
|
|
|
|
2019-01-24 06:39:11 +08:00
|
|
|
/// Emits, at \p IRB's insertion point, a call that passes ShadowGlobal
/// through an empty inline-asm statement and yields it as an i8 pointer named
/// ".hwasan.shadow".
Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) {
  // An empty inline asm with input reg == output reg.
  // An opaque no-op cast, basically.
  FunctionType *AsmTy =
      FunctionType::get(Int8PtrTy, {ShadowGlobal->getType()}, false);
  InlineAsm *Asm = InlineAsm::get(AsmTy, StringRef(""), StringRef("=r,0"),
                                  /*hasSideEffects=*/false);
  return IRB.CreateCall(Asm, {ShadowGlobal}, ".hwasan.shadow");
}
|
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
/// Produces the dynamic shadow base without going through TLS.
///
/// \returns nullptr when the mapping offset is not the dynamic sentinel;
/// otherwise either the ifunc-style value (when Mapping.InGlobal is set) or a
/// load of the dynamic-shadow-address global, emitted at \p IRB's insertion
/// point.
Value *HWAddressSanitizer::getDynamicShadowNonTls(IRBuilder<> &IRB) {
  // Generate code only when dynamic addressing is needed.
  if (Mapping.Offset != kDynamicShadowSentinel)
    return nullptr;

  if (Mapping.InGlobal)
    return getDynamicShadowIfunc(IRB);

  // Look the global up in the module that owns the current insertion block.
  Module *OwningModule = IRB.GetInsertBlock()->getParent()->getParent();
  Value *GlobalDynamicAddress = OwningModule->getOrInsertGlobal(
      kHwasanShadowMemoryDynamicAddress, Int8PtrTy);
  return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress);
}
|
|
|
|
|
|
|
|
/// Decides whether \p I is a memory access that should be instrumented.
///
/// Loads, stores, atomic RMW and cmpxchg instructions are recognised, each
/// subject to its command-line switch. For a recognised instruction the
/// out-parameters \p IsWrite, \p TypeSize (store size in bits) and
/// \p Alignment (0 for the atomic forms) are filled in. \p MaybeMask is not
/// written by this function.
///
/// \returns the accessed pointer operand, or nullptr if the access must not
/// be instrumented. The out-parameters are only meaningful for a non-null
/// result.
Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
                                                     bool *IsWrite,
                                                     uint64_t *TypeSize,
                                                     unsigned *Alignment,
                                                     Value **MaybeMask) {
  // Skip memory accesses inserted by another instrumentation.
  if (I->getMetadata("nosanitize"))
    return nullptr;

  // Do not instrument the load fetching the dynamic shadow address.
  if (LocalDynamicShadow == I)
    return nullptr;

  Value *PtrOperand = nullptr;
  const DataLayout &DL = I->getModule()->getDataLayout();

  if (auto *Load = dyn_cast<LoadInst>(I)) {
    if (!ClInstrumentReads)
      return nullptr;
    *IsWrite = false;
    *TypeSize = DL.getTypeStoreSizeInBits(Load->getType());
    *Alignment = Load->getAlignment();
    PtrOperand = Load->getPointerOperand();
  } else if (auto *Store = dyn_cast<StoreInst>(I)) {
    if (!ClInstrumentWrites)
      return nullptr;
    *IsWrite = true;
    *TypeSize = DL.getTypeStoreSizeInBits(Store->getValueOperand()->getType());
    *Alignment = Store->getAlignment();
    PtrOperand = Store->getPointerOperand();
  } else if (auto *AtomicRMW = dyn_cast<AtomicRMWInst>(I)) {
    if (!ClInstrumentAtomics)
      return nullptr;
    *IsWrite = true;
    *TypeSize =
        DL.getTypeStoreSizeInBits(AtomicRMW->getValOperand()->getType());
    *Alignment = 0;
    PtrOperand = AtomicRMW->getPointerOperand();
  } else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(I)) {
    if (!ClInstrumentAtomics)
      return nullptr;
    *IsWrite = true;
    *TypeSize =
        DL.getTypeStoreSizeInBits(CmpXchg->getCompareOperand()->getType());
    *Alignment = 0;
    PtrOperand = CmpXchg->getPointerOperand();
  }

  // Not one of the recognised access kinds.
  if (!PtrOperand)
    return nullptr;

  // Do not instrument accesses from different address spaces; we cannot deal
  // with them.
  Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
  if (PtrTy->getPointerAddressSpace() != 0)
    return nullptr;

  // Ignore swifterror addresses.
  // swifterror memory addresses are mem2reg promoted by instruction
  // selection. As such they cannot have regular uses like an instrumentation
  // function and it makes no sense to track them as memory.
  if (PtrOperand->isSwiftError())
    return nullptr;

  return PtrOperand;
}
|
|
|
|
|
2018-03-24 01:57:54 +08:00
|
|
|
/// Returns the operand index holding the address for a load, store, atomic
/// RMW or cmpxchg instruction. Any other instruction kind is reported via
/// report_fatal_error("Unexpected instruction").
static unsigned getPointerOperandIndex(Instruction *I) {
  if (auto *Load = dyn_cast<LoadInst>(I))
    return Load->getPointerOperandIndex();
  if (auto *Store = dyn_cast<StoreInst>(I))
    return Store->getPointerOperandIndex();
  if (auto *AtomicRMW = dyn_cast<AtomicRMWInst>(I))
    return AtomicRMW->getPointerOperandIndex();
  if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(I))
    return CmpXchg->getPointerOperandIndex();

  report_fatal_error("Unexpected instruction");
  // Gives the fall-through path a return value.
  return -1;
}
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
|
|
|
|
size_t Res = countTrailingZeros(TypeSize / 8);
|
|
|
|
assert(Res < kNumberOfAccessSizes);
|
|
|
|
return Res;
|
|
|
|
}
|
|
|
|
|
2018-03-24 01:57:54 +08:00
|
|
|
/// Rewrites the pointer operand of \p I to an untagged copy of \p Addr.
///
/// Does nothing on AArch64 targets. Otherwise the new instructions are
/// inserted immediately before \p I: cast to integer, untagPointer(), cast
/// back to the original pointer type.
void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
  if (TargetTriple.isAArch64())
    return;

  IRBuilder<> IRB(I);
  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
  Value *UntaggedLong = untagPointer(IRB, AddrLong);
  Value *UntaggedPtr = IRB.CreateIntToPtr(UntaggedLong, Addr->getType());
  I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
}
|
|
|
|
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
/// Returns the shadow base as an i8 pointer: the function-local dynamic
/// shadow value when one has been set, otherwise the constant Mapping.Offset
/// converted to a pointer.
Value *HWAddressSanitizer::shadowBase() {
  if (!LocalDynamicShadow) {
    Constant *StaticOffset = ConstantInt::get(IntptrTy, Mapping.Offset);
    return ConstantExpr::getIntToPtr(StaticOffset, Int8PtrTy);
  }
  return LocalDynamicShadow;
}
|
|
|
|
|
|
|
|
/// Emit IR that maps the integer address \p Mem to a pointer to its shadow
/// byte.
///
/// \param Mem  address to translate (shifted right by Mapping.Scale).
/// \param IRB  builder used to emit the instructions.
/// \returns an i8 pointer to the shadow location.
Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
  // Mem >> Scale
  Value *ScaledAddr = IRB.CreateLShr(Mem, Mapping.Scale);
  if (Mapping.Offset != 0) {
    // Non-zero offset: index the shadow base by the scaled address, i.e.
    // shadowBase() + (Mem >> Scale).
    return IRB.CreateGEP(Int8Ty, shadowBase(), ScaledAddr);
  }
  // Zero offset: the scaled address already is the shadow address.
  return IRB.CreateIntToPtr(ScaledAddr, Int8PtrTy);
}
|
|
|
|
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
|
2017-12-13 09:16:34 +08:00
|
|
|
unsigned AccessSizeIndex,
|
|
|
|
Instruction *InsertBefore) {
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
const int64_t AccessInfo = Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex;
|
2017-12-13 09:16:34 +08:00
|
|
|
IRBuilder<> IRB(InsertBefore);
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
|
|
|
|
if (!ClInlineAllChecks && TargetTriple.isAArch64() &&
|
|
|
|
TargetTriple.isOSBinFormatELF() && !Recover) {
|
|
|
|
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
|
|
|
|
Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
|
|
|
|
IRB.CreateCall(
|
|
|
|
Intrinsic::getDeclaration(M, Intrinsic::hwasan_check_memaccess),
|
|
|
|
{shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
|
2018-03-24 01:57:54 +08:00
|
|
|
Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
|
|
|
|
IRB.getInt8Ty());
|
2018-02-22 03:52:23 +08:00
|
|
|
Value *AddrLong = untagPointer(IRB, PtrLong);
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Value *Shadow = memToShadow(AddrLong, IRB);
|
2019-02-02 04:44:24 +08:00
|
|
|
Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow);
|
2017-12-13 09:16:34 +08:00
|
|
|
Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
|
|
|
|
|
2018-04-14 02:05:21 +08:00
|
|
|
int matchAllTag = ClMatchAllTag.getNumOccurrences() > 0 ?
|
|
|
|
ClMatchAllTag : (CompileKernel ? 0xFF : -1);
|
|
|
|
if (matchAllTag != -1) {
|
2018-04-05 04:44:59 +08:00
|
|
|
Value *TagNotIgnored = IRB.CreateICmpNE(PtrTag,
|
2018-04-14 02:05:21 +08:00
|
|
|
ConstantInt::get(PtrTag->getType(), matchAllTag));
|
2018-04-05 04:44:59 +08:00
|
|
|
TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
|
|
|
|
}
|
|
|
|
|
2018-10-15 17:34:05 +08:00
|
|
|
Instruction *CheckTerm =
|
2019-07-10 04:22:36 +08:00
|
|
|
SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
|
2017-12-13 09:16:34 +08:00
|
|
|
MDBuilder(*C).createBranchWeights(1, 100000));
|
|
|
|
|
|
|
|
IRB.SetInsertPoint(CheckTerm);
|
2019-07-10 04:22:36 +08:00
|
|
|
Value *OutOfShortGranuleTagRange =
|
|
|
|
IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
|
|
|
|
Instruction *CheckFailTerm =
|
|
|
|
SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
|
|
|
|
MDBuilder(*C).createBranchWeights(1, 100000));
|
|
|
|
|
|
|
|
IRB.SetInsertPoint(CheckTerm);
|
|
|
|
Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
|
|
|
|
PtrLowBits = IRB.CreateAdd(
|
|
|
|
PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
|
|
|
|
Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
|
|
|
|
SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
|
|
|
|
MDBuilder(*C).createBranchWeights(1, 100000),
|
|
|
|
nullptr, nullptr, CheckFailTerm->getParent());
|
|
|
|
|
|
|
|
IRB.SetInsertPoint(CheckTerm);
|
|
|
|
Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
|
|
|
|
InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
|
|
|
|
Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
|
|
|
|
Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
|
|
|
|
SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
|
|
|
|
MDBuilder(*C).createBranchWeights(1, 100000),
|
|
|
|
nullptr, nullptr, CheckFailTerm->getParent());
|
|
|
|
|
|
|
|
IRB.SetInsertPoint(CheckFailTerm);
|
2018-03-24 01:57:54 +08:00
|
|
|
InlineAsm *Asm;
|
|
|
|
switch (TargetTriple.getArch()) {
|
|
|
|
case Triple::x86_64:
|
|
|
|
// The signal handler will find the data address in rdi.
|
|
|
|
Asm = InlineAsm::get(
|
|
|
|
FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
|
|
|
|
"int3\nnopl " + itostr(0x40 + AccessInfo) + "(%rax)",
|
|
|
|
"{rdi}",
|
|
|
|
/*hasSideEffects=*/true);
|
|
|
|
break;
|
|
|
|
case Triple::aarch64:
|
|
|
|
case Triple::aarch64_be:
|
|
|
|
// The signal handler will find the data address in x0.
|
|
|
|
Asm = InlineAsm::get(
|
|
|
|
FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
|
|
|
|
"brk #" + itostr(0x900 + AccessInfo),
|
|
|
|
"{x0}",
|
|
|
|
/*hasSideEffects=*/true);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
report_fatal_error("unsupported architecture");
|
|
|
|
}
|
2017-12-13 09:16:34 +08:00
|
|
|
IRB.CreateCall(Asm, PtrLong);
|
2019-07-10 04:22:36 +08:00
|
|
|
if (Recover)
|
|
|
|
cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
|
2017-12-13 09:16:34 +08:00
|
|
|
}
|
|
|
|
|
2018-12-20 17:04:33 +08:00
|
|
|
/// Replace a memory intrinsic with a call to the matching HWASan runtime
/// routine and erase the intrinsic.
///
/// Transfer intrinsics become calls to HWAsanMemmove or HWAsanMemcpy, and
/// memset becomes a call to HWAsanMemset. Operands 0..2 of \p MI are cast to
/// the argument types used here (i8*, i8* or i32, and IntptrTy).
void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
  IRBuilder<> IRB(MI);
  const bool IsTransfer = isa<MemTransferInst>(MI);
  if (IsTransfer) {
    // memcpy / memmove: (dest, src, length)
    Value *Dest = IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy());
    Value *Src = IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy());
    Value *Length = IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false);
    IRB.CreateCall(isa<MemMoveInst>(MI) ? HWAsanMemmove : HWAsanMemcpy,
                   {Dest, Src, Length});
  } else if (isa<MemSetInst>(MI)) {
    // memset: (dest, fill value widened to i32, length)
    Value *Dest = IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy());
    Value *Fill = IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false);
    Value *Length = IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false);
    IRB.CreateCall(HWAsanMemset, {Dest, Fill, Length});
  }
  // The intrinsic is removed whether or not a replacement call was emitted.
  MI->eraseFromParent();
}
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
|
2017-12-09 08:21:41 +08:00
|
|
|
bool IsWrite = false;
|
|
|
|
unsigned Alignment = 0;
|
|
|
|
uint64_t TypeSize = 0;
|
|
|
|
Value *MaybeMask = nullptr;
|
2018-12-20 17:04:33 +08:00
|
|
|
|
|
|
|
if (ClInstrumentMemIntrinsics && isa<MemIntrinsic>(I)) {
|
|
|
|
instrumentMemIntrinsic(cast<MemIntrinsic>(I));
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
Value *Addr =
|
|
|
|
isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
|
|
|
|
|
|
|
|
if (!Addr)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (MaybeMask)
|
|
|
|
return false; //FIXME
|
|
|
|
|
|
|
|
IRBuilder<> IRB(I);
|
|
|
|
if (isPowerOf2_64(TypeSize) &&
|
2017-12-13 09:16:34 +08:00
|
|
|
(TypeSize / 8 <= (1UL << (kNumberOfAccessSizes - 1))) &&
|
2018-04-21 04:04:04 +08:00
|
|
|
(Alignment >= (1UL << Mapping.Scale) || Alignment == 0 ||
|
2017-12-13 09:16:34 +08:00
|
|
|
Alignment >= TypeSize / 8)) {
|
2017-12-09 08:21:41 +08:00
|
|
|
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
|
2017-12-13 09:16:34 +08:00
|
|
|
if (ClInstrumentWithCalls) {
|
|
|
|
IRB.CreateCall(HwasanMemoryAccessCallback[IsWrite][AccessSizeIndex],
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
IRB.CreatePointerCast(Addr, IntptrTy));
|
2017-12-13 09:16:34 +08:00
|
|
|
} else {
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
instrumentMemAccessInline(Addr, IsWrite, AccessSizeIndex, I);
|
2017-12-13 09:16:34 +08:00
|
|
|
}
|
2017-12-09 08:21:41 +08:00
|
|
|
} else {
|
|
|
|
IRB.CreateCall(HwasanMemoryAccessCallbackSized[IsWrite],
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
{IRB.CreatePointerCast(Addr, IntptrTy),
|
|
|
|
ConstantInt::get(IntptrTy, TypeSize / 8)});
|
2017-12-09 08:21:41 +08:00
|
|
|
}
|
2018-03-24 01:57:54 +08:00
|
|
|
untagPointerOperand(I, Addr);
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-01-12 06:53:30 +08:00
|
|
|
static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
|
|
|
|
uint64_t ArraySize = 1;
|
|
|
|
if (AI.isArrayAllocation()) {
|
|
|
|
const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
|
|
|
|
assert(CI && "non-constant array size");
|
|
|
|
ArraySize = CI->getZExtValue();
|
|
|
|
}
|
|
|
|
Type *Ty = AI.getAllocatedType();
|
|
|
|
uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
|
|
|
|
return SizeInBytes * ArraySize;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
|
2019-07-10 04:22:36 +08:00
|
|
|
Value *Tag, size_t Size) {
|
2019-08-07 06:07:29 +08:00
|
|
|
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
|
2018-01-12 06:53:30 +08:00
|
|
|
|
|
|
|
Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
|
|
|
|
if (ClInstrumentWithCalls) {
|
|
|
|
IRB.CreateCall(HwasanTagMemoryFunc,
|
2018-08-15 08:39:35 +08:00
|
|
|
{IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
|
2019-07-10 04:22:36 +08:00
|
|
|
ConstantInt::get(IntptrTy, AlignedSize)});
|
2018-01-12 06:53:30 +08:00
|
|
|
} else {
|
2018-04-21 04:04:04 +08:00
|
|
|
size_t ShadowSize = Size >> Mapping.Scale;
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
|
2018-01-12 06:53:30 +08:00
|
|
|
// If this memset is not inlined, it will be intercepted in the hwasan
|
|
|
|
// runtime library. That's OK, because the interceptor skips the checks if
|
|
|
|
// the address is in the shadow region.
|
|
|
|
// FIXME: the interceptor is not as fast as real memset. Consider lowering
|
|
|
|
// llvm.memset right here into either a sequence of stores, or a call to
|
|
|
|
// hwasan_tag_memory.
|
2019-07-10 04:22:36 +08:00
|
|
|
if (ShadowSize)
|
|
|
|
IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
|
|
|
|
if (Size != AlignedSize) {
|
|
|
|
IRB.CreateStore(
|
2019-08-07 06:07:29 +08:00
|
|
|
ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
|
2019-07-10 04:22:36 +08:00
|
|
|
IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
|
|
|
|
IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
|
|
|
|
Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
|
|
|
|
AlignedSize - 1));
|
|
|
|
}
|
2018-01-12 06:53:30 +08:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Maps an alloca index to an 8-bit retag mask, cycling through a fixed table
// once the index exceeds the number of table entries.
static unsigned RetagMask(unsigned AllocaNo) {
  // A list of 8-bit numbers that have at most one run of non-zero bits.
  // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for
  // these masks.
  // The list does not include the value 255, which is used for UAR.
  //
  // Because we are more likely to use earlier elements of this list than
  // later ones, it is sorted in increasing order of probability of collision
  // with a mask allocated (temporally) nearby. The program that generated
  // this list can be found at:
  // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py
  static const unsigned Masks[] = {
      0,  128, 64, 192, 32,  96,  224, 112, 240, // entries 0..8
      48, 16,  120, 248, 56,  24,  8,   124, 252, // entries 9..17
      60, 28,  12, 4,   126, 254, 62,  30,  14,  // entries 18..26
      6,  2,   127, 63,  31,  15,  7,   3,   1};  // entries 27..35
  const unsigned NumMasks = sizeof(Masks) / sizeof(Masks[0]);
  return Masks[AllocaNo % NumMasks];
}
|
|
|
|
|
2018-01-13 09:32:15 +08:00
|
|
|
// Emits a call to HwasanGenerateTagFunc at \p IRB and returns its result
// zero-extended to IntptrTy.
Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
  Value *GeneratedTag = IRB.CreateCall(HwasanGenerateTagFunc);
  return IRB.CreateZExt(GeneratedTag, IntptrTy);
}
|
2018-01-12 06:53:30 +08:00
|
|
|
|
2018-01-13 09:32:15 +08:00
|
|
|
// Returns the base tag for the current function's stack frame.
//
// With ClGenerateTagsWithCalls a fresh tag is requested via a call; if
// StackBaseTag has already been set it is reused. Otherwise the tag is
// derived from the frame address returned by llvm.frameaddress(0).
Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
  if (ClGenerateTagsWithCalls)
    return getNextTagWithCall(IRB);
  if (StackBaseTag)
    return StackBaseTag;

  // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
  // first).
  Function *CurFn = IRB.GetInsertBlock()->getParent();
  Module *Mod = CurFn->getParent();
  Type *FramePtrTy =
      IRB.getInt8PtrTy(Mod->getDataLayout().getAllocaAddrSpace());
  Function *FrameAddressFn =
      Intrinsic::getDeclaration(Mod, Intrinsic::frameaddress, FramePtrTy);
  Value *FrameAddr = IRB.CreateCall(
      FrameAddressFn, {Constant::getNullValue(IRB.getInt32Ty())});

  // Extract some entropy from the stack pointer for the tags.
  // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
  // between functions).
  Value *FrameAddrLong = IRB.CreatePointerCast(FrameAddr, IntptrTy);
  Value *HighBits = IRB.CreateLShr(FrameAddrLong, 20);
  return IRB.CreateXor(FrameAddrLong, HighBits, "hwasan.stack.base.tag");
}
|
|
|
|
|
|
|
|
// Returns the tag for the alloca with index \p AllocaNo: either a freshly
// generated one (ClGenerateTagsWithCalls) or \p StackTag xor-ed with the
// retag mask selected by \p AllocaNo. \p AI is not consulted.
Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
                                        AllocaInst *AI, unsigned AllocaNo) {
  if (!ClGenerateTagsWithCalls) {
    Value *Mask = ConstantInt::get(IntptrTy, RetagMask(AllocaNo));
    return IRB.CreateXor(StackTag, Mask);
  }
  return getNextTagWithCall(IRB);
}
|
|
|
|
|
|
|
|
// Returns the tag used when re-tagging alloca memory on function exit.
// In priority order: constant zero (ClUARRetagToZero), a freshly generated
// tag (ClGenerateTagsWithCalls), or \p StackTag xor 0xFF.
Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
  Value *UARTag = nullptr;
  if (ClUARRetagToZero)
    UARTag = ConstantInt::get(IntptrTy, 0);
  else if (ClGenerateTagsWithCalls)
    UARTag = getNextTagWithCall(IRB);
  else
    UARTag = IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
  return UARTag;
}
|
|
|
|
|
2018-02-09 08:59:10 +08:00
|
|
|
// Add a tag to an address.
|
2018-04-21 04:04:04 +08:00
|
|
|
Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
|
|
|
|
Value *PtrLong, Value *Tag) {
|
2018-02-09 08:59:10 +08:00
|
|
|
Value *TaggedPtrLong;
|
2018-04-14 02:05:21 +08:00
|
|
|
if (CompileKernel) {
|
2018-02-09 08:59:10 +08:00
|
|
|
// Kernel addresses have 0xFF in the most significant byte.
|
|
|
|
Value *ShiftedTag = IRB.CreateOr(
|
|
|
|
IRB.CreateShl(Tag, kPointerTagShift),
|
|
|
|
ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
|
|
|
|
TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
|
|
|
|
} else {
|
|
|
|
// Userspace can simply do OR (tag << 56);
|
|
|
|
Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
|
|
|
|
TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
|
|
|
|
}
|
|
|
|
return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
|
|
|
|
}
|
|
|
|
|
2018-02-22 03:52:23 +08:00
|
|
|
// Remove tag from an address.
|
|
|
|
Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
|
|
|
|
Value *UntaggedPtrLong;
|
2018-04-14 02:05:21 +08:00
|
|
|
if (CompileKernel) {
|
2018-02-22 03:52:23 +08:00
|
|
|
// Kernel addresses have 0xFF in the most significant byte.
|
|
|
|
UntaggedPtrLong = IRB.CreateOr(PtrLong,
|
|
|
|
ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
|
|
|
|
} else {
|
|
|
|
// Userspace addresses have 0x00.
|
|
|
|
UntaggedPtrLong = IRB.CreateAnd(PtrLong,
|
|
|
|
ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
|
|
|
|
}
|
|
|
|
return UntaggedPtrLong;
|
|
|
|
}
|
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
// Returns a pointer (to \p Ty) to the per-thread hwasan slot, or nullptr if
// no slot is available.
//
// On AArch64 Android the slot is addressed relative to the thread pointer;
// elsewhere ThreadPtrGlobal is returned when it is set.
Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
  Module *Mod = IRB.GetInsertBlock()->getParent()->getParent();

  const bool HasFixedTlsSlot =
      TargetTriple.isAArch64() && TargetTriple.isAndroid();
  if (!HasFixedTlsSlot) {
    if (ThreadPtrGlobal)
      return ThreadPtrGlobal;
    return nullptr;
  }

  // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
  // in Bionic's libc/private/bionic_tls.h.
  Function *ThreadPointerFn =
      Intrinsic::getDeclaration(Mod, Intrinsic::thread_pointer);
  Value *ThreadPointer = IRB.CreateCall(ThreadPointerFn);
  Value *SlotBytePtr =
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), ThreadPointer, 0x30);
  return IRB.CreatePointerCast(SlotBytePtr, Ty->getPointerTo(0));
}
|
|
|
|
|
2019-06-18 07:39:51 +08:00
|
|
|
// Emits the function prologue at \p IRB and sets LocalDynamicShadow.
//
// Three strategies are used, in this order:
//  * mapping not in TLS: take the shadow base from getDynamicShadowNonTls;
//  * Android without a frame record: take it from getDynamicShadowIfunc;
//  * otherwise: load the per-thread slot, optionally (\p WithFrameRecord)
//    set StackBaseTag and push a PC/SP record into the ring buffer, and
//    derive the shadow base from the loaded value.
void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
  if (!Mapping.InTls) {
    LocalDynamicShadow = getDynamicShadowNonTls(IRB);
    return;
  }

  const bool NeedsThreadSlot = WithFrameRecord || !TargetTriple.isAndroid();
  if (!NeedsThreadSlot) {
    LocalDynamicShadow = getDynamicShadowIfunc(IRB);
    return;
  }

  Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
  assert(SlotPtr);

  Instruction *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);

  Function *F = IRB.GetInsertBlock()->getParent();
  const bool IsInterceptorAbi =
      F->getFnAttribute("hwasan-abi").getValueAsString() == "interceptor";
  if (IsInterceptorAbi) {
    // If the slot holds zero, call HwasanThreadEnterFunc and reload the
    // slot; a phi then merges the original and the reloaded value.
    Value *SlotIsZero =
        IRB.CreateICmpEQ(ThreadLong, ConstantInt::get(IntptrTy, 0));
    Instruction *SplitBefore = cast<Instruction>(SlotIsZero)->getNextNode();
    MDNode *Weights = MDBuilder(*C).createBranchWeights(1, 100000);
    auto *ThenTerm = cast<BranchInst>(SplitBlockAndInsertIfThen(
        SlotIsZero, SplitBefore, /*Unreachable=*/false, Weights));

    IRB.SetInsertPoint(ThenTerm);
    // FIXME: This should call a new runtime function with a custom calling
    // convention to avoid needing to spill all arguments here.
    IRB.CreateCall(HwasanThreadEnterFunc);
    LoadInst *ReloadedThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);

    IRB.SetInsertPoint(&*ThenTerm->getSuccessor(0)->begin());
    PHINode *MergedThreadLong = IRB.CreatePHI(IntptrTy, 2);
    MergedThreadLong->addIncoming(ThreadLong, ThreadLong->getParent());
    MergedThreadLong->addIncoming(ReloadedThreadLong,
                                  ReloadedThreadLong->getParent());
    ThreadLong = MergedThreadLong;
  }

  // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI.
  Value *ThreadLongMaybeUntagged =
      TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong);

  if (WithFrameRecord) {
    StackBaseTag = IRB.CreateAShr(ThreadLong, 3);

    // Prepare ring buffer data.
    Value *PC = (TargetTriple.getArch() == Triple::aarch64)
                    ? readRegister(IRB, "pc")
                    : IRB.CreatePtrToInt(F, IntptrTy);
    Module *Mod = F->getParent();
    Type *FramePtrTy =
        IRB.getInt8PtrTy(Mod->getDataLayout().getAllocaAddrSpace());
    Function *FrameAddressFn =
        Intrinsic::getDeclaration(Mod, Intrinsic::frameaddress, FramePtrTy);
    Value *FrameAddr = IRB.CreateCall(
        FrameAddressFn, {Constant::getNullValue(IRB.getInt32Ty())});
    Value *SP = IRB.CreatePtrToInt(FrameAddr, IntptrTy);

    // Mix SP and PC.
    // Assumptions:
    // PC is 0x0000PPPPPPPPPPPP  (48 bits are meaningful, others are zero)
    // SP is 0xsssssssssssSSSS0  (4 lower bits are zero)
    // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
    //       0xSSSSPPPPPPPPPPPP
    SP = IRB.CreateShl(SP, 44);

    // Store data to ring buffer.
    Value *RecordPtr =
        IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0));
    Value *Record = IRB.CreateOr(PC, SP);
    IRB.CreateStore(Record, RecordPtr);

    // Update the ring buffer. Top byte of ThreadLong defines the size of the
    // buffer in pages, it must be a power of two, and the start of the buffer
    // must be aligned by twice that much. Therefore wrap around of the ring
    // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
    // The use of AShr instead of LShr is due to
    //   https://bugs.llvm.org/show_bug.cgi?id=39030
    // Runtime library makes sure not to use the highest bit.
    Value *SizeInPages = IRB.CreateAShr(ThreadLong, 56);
    Value *SizeInBytes = IRB.CreateShl(SizeInPages, 12, "", /*HasNUW=*/true,
                                       /*HasNSW=*/true);
    Value *WrapMask =
        IRB.CreateXor(SizeInBytes, ConstantInt::get(IntptrTy, (uint64_t)-1));
    Value *Advanced =
        IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8));
    Value *ThreadLongNew = IRB.CreateAnd(Advanced, WrapMask);
    IRB.CreateStore(ThreadLongNew, SlotPtr);
  }

  // Get shadow base address by aligning RecordPtr up.
  // Note: this is not correct if the pointer is already aligned.
  // Runtime library will make sure this never happens.
  Value *LowBitsSet = IRB.CreateOr(
      ThreadLongMaybeUntagged,
      ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1));
  Value *ShadowBaseLong =
      IRB.CreateAdd(LowBitsSet, ConstantInt::get(IntptrTy, 1), "hwasan.shadow");
  LocalDynamicShadow = IRB.CreateIntToPtr(ShadowBaseLong, Int8PtrTy);
}
|
2018-09-25 06:50:32 +08:00
|
|
|
|
2019-06-28 07:24:07 +08:00
|
|
|
// Emits a call to llvm.read_register (overloaded on IntptrTy) at \p IRB for
// the register called \p Name and returns the call's result.
Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
  Module *Mod = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ReadRegisterFn =
      Intrinsic::getDeclaration(Mod, Intrinsic::read_register, IntptrTy);

  // The register is identified by a metadata string wrapped as a value.
  MDNode *RegisterName = MDNode::get(*C, {MDString::get(*C, Name)});
  Value *RegisterArg = MetadataAsValue::get(*C, RegisterName);
  return IRB.CreateCall(ReadRegisterFn, {RegisterArg});
}
|
2019-05-17 07:54:41 +08:00
|
|
|
|
2019-06-28 07:24:07 +08:00
|
|
|
// Inserts, right after every instruction in \p LandingPadVec, a call to
// HWAsanHandleVfork that is passed the current stack pointer register
// ("rsp" on x86_64, "sp" otherwise). Always returns true.
bool HWAddressSanitizer::instrumentLandingPads(
    SmallVectorImpl<Instruction *> &LandingPadVec) {
  const char *StackPointerReg =
      (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp";

  for (Instruction *LandingPad : LandingPadVec) {
    IRBuilder<> PadIRB(LandingPad->getNextNode());
    Value *StackPointer = readRegister(PadIRB, StackPointerReg);
    PadIRB.CreateCall(HWAsanHandleVfork, {StackPointer});
  }
  return true;
}
|
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
bool HWAddressSanitizer::instrumentStack(
|
|
|
|
SmallVectorImpl<AllocaInst *> &Allocas,
|
2019-06-18 07:39:41 +08:00
|
|
|
DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap,
|
2018-09-25 07:03:34 +08:00
|
|
|
SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
|
2018-01-12 06:53:30 +08:00
|
|
|
// Ideally, we want to calculate tagged stack base pointer, and rewrite all
|
|
|
|
// alloca addresses using that. Unfortunately, offsets are not known yet
|
|
|
|
// (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
|
|
|
|
// temp, shift-OR it into each alloca address and xor with the retag mask.
|
|
|
|
// This generates one extra instruction per alloca use.
|
|
|
|
for (unsigned N = 0; N < Allocas.size(); ++N) {
|
|
|
|
auto *AI = Allocas[N];
|
2018-09-25 07:03:34 +08:00
|
|
|
IRBuilder<> IRB(AI->getNextNode());
|
2018-01-12 06:53:30 +08:00
|
|
|
|
|
|
|
// Replace uses of the alloca with tagged address.
|
2018-01-13 09:32:15 +08:00
|
|
|
Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
|
|
|
|
Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
|
2018-02-09 08:59:10 +08:00
|
|
|
Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
|
2018-01-12 06:53:30 +08:00
|
|
|
std::string Name =
|
|
|
|
AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
|
2018-02-09 08:59:10 +08:00
|
|
|
Replacement->setName(Name + ".hwasan");
|
2018-01-12 06:53:30 +08:00
|
|
|
|
[IR] Value: add replaceUsesWithIf() utility
Summary:
While there is always a `Value::replaceAllUsesWith()`,
sometimes the replacement needs to be conditional.
I have only cleaned a few cases where `replaceUsesWithIf()`
could be used, to both add test coverage,
and show that it is actually useful.
Reviewers: jdoerfert, spatel, RKSimon, craig.topper
Reviewed By: jdoerfert
Subscribers: dschuff, sbc100, jgravelle-google, hiraditya, aheejin, george.burgess.iv, asbirlea, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65528
llvm-svn: 367548
2019-08-01 20:32:08 +08:00
|
|
|
AI->replaceUsesWithIf(Replacement,
|
|
|
|
[AILong](Use &U) { return U.getUser() != AILong; });
|
2018-01-12 06:53:30 +08:00
|
|
|
|
2019-06-18 07:39:41 +08:00
|
|
|
for (auto *DDI : AllocaDeclareMap.lookup(AI)) {
|
|
|
|
DIExpression *OldExpr = DDI->getExpression();
|
|
|
|
DIExpression *NewExpr = DIExpression::append(
|
|
|
|
OldExpr, {dwarf::DW_OP_LLVM_tag_offset, RetagMask(N)});
|
|
|
|
DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr));
|
|
|
|
}
|
|
|
|
|
2019-07-10 04:22:36 +08:00
|
|
|
size_t Size = getAllocaSizeInBytes(*AI);
|
|
|
|
tagAlloca(IRB, AI, Tag, Size);
|
2018-01-12 06:53:30 +08:00
|
|
|
|
|
|
|
for (auto RI : RetVec) {
|
|
|
|
IRB.SetInsertPoint(RI);
|
|
|
|
|
|
|
|
// Re-tag alloca memory with the special UAR tag.
|
2018-01-13 09:32:15 +08:00
|
|
|
Value *Tag = getUARTag(IRB, StackTag);
|
2019-08-07 06:07:29 +08:00
|
|
|
tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
|
2018-01-12 06:53:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
|
|
|
|
return (AI.getAllocatedType()->isSized() &&
|
|
|
|
// FIXME: instrument dynamic allocas, too
|
|
|
|
AI.isStaticAlloca() &&
|
|
|
|
// alloca() may be called with 0 size, ignore it.
|
|
|
|
getAllocaSizeInBytes(AI) > 0 &&
|
|
|
|
// We are only interested in allocas not promotable to registers.
|
|
|
|
// Promotable allocas are common under -O0.
|
|
|
|
!isAllocaPromotable(&AI) &&
|
|
|
|
// inalloca allocas are not treated as static, and we don't want
|
|
|
|
// dynamic alloca instrumentation for them as well.
|
|
|
|
!AI.isUsedWithInAlloca() &&
|
|
|
|
// swifterror allocas are register promoted by ISel
|
|
|
|
!AI.isSwiftError());
|
|
|
|
}
|
|
|
|
|
2019-05-15 05:17:21 +08:00
|
|
|
bool HWAddressSanitizer::sanitizeFunction(Function &F) {
|
2017-12-09 08:21:41 +08:00
|
|
|
if (&F == HwasanCtorFunction)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
|
|
|
|
return false;
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
|
2017-12-09 08:21:41 +08:00
|
|
|
|
|
|
|
SmallVector<Instruction*, 16> ToInstrument;
|
2018-01-12 06:53:30 +08:00
|
|
|
SmallVector<AllocaInst*, 8> AllocasToInstrument;
|
|
|
|
SmallVector<Instruction*, 8> RetVec;
|
2019-05-17 07:54:41 +08:00
|
|
|
SmallVector<Instruction*, 8> LandingPadVec;
|
2019-06-18 07:39:41 +08:00
|
|
|
DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> AllocaDeclareMap;
|
2017-12-09 08:21:41 +08:00
|
|
|
for (auto &BB : F) {
|
|
|
|
for (auto &Inst : BB) {
|
2018-01-12 06:53:30 +08:00
|
|
|
if (ClInstrumentStack)
|
|
|
|
if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
|
|
|
|
if (isInterestingAlloca(*AI))
|
|
|
|
AllocasToInstrument.push_back(AI);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
|
|
|
|
isa<CleanupReturnInst>(Inst))
|
2018-01-12 06:53:30 +08:00
|
|
|
RetVec.push_back(&Inst);
|
|
|
|
|
2019-06-18 07:39:41 +08:00
|
|
|
if (auto *DDI = dyn_cast<DbgDeclareInst>(&Inst))
|
|
|
|
if (auto *Alloca = dyn_cast_or_null<AllocaInst>(DDI->getAddress()))
|
|
|
|
AllocaDeclareMap[Alloca].push_back(DDI);
|
|
|
|
|
2019-05-17 07:54:41 +08:00
|
|
|
if (ClInstrumentLandingPads && isa<LandingPadInst>(Inst))
|
|
|
|
LandingPadVec.push_back(&Inst);
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
Value *MaybeMask = nullptr;
|
|
|
|
bool IsWrite;
|
|
|
|
unsigned Alignment;
|
|
|
|
uint64_t TypeSize;
|
|
|
|
Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
|
|
|
|
&Alignment, &MaybeMask);
|
|
|
|
if (Addr || isa<MemIntrinsic>(Inst))
|
|
|
|
ToInstrument.push_back(&Inst);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-17 07:54:41 +08:00
|
|
|
initializeCallbacks(*F.getParent());
|
|
|
|
|
|
|
|
if (!LandingPadVec.empty())
|
|
|
|
instrumentLandingPads(LandingPadVec);
|
|
|
|
|
2018-09-25 07:03:34 +08:00
|
|
|
if (AllocasToInstrument.empty() && ToInstrument.empty())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
assert(!LocalDynamicShadow);
|
|
|
|
|
|
|
|
Instruction *InsertPt = &*F.getEntryBlock().begin();
|
|
|
|
IRBuilder<> EntryIRB(InsertPt);
|
2019-06-18 07:39:51 +08:00
|
|
|
emitPrologue(EntryIRB,
|
|
|
|
/*WithFrameRecord*/ ClRecordStackHistory &&
|
|
|
|
!AllocasToInstrument.empty());
|
2018-09-25 07:03:34 +08:00
|
|
|
|
|
|
|
bool Changed = false;
|
|
|
|
if (!AllocasToInstrument.empty()) {
|
|
|
|
Value *StackTag =
|
|
|
|
ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
|
2019-06-18 07:39:41 +08:00
|
|
|
Changed |= instrumentStack(AllocasToInstrument, AllocaDeclareMap, RetVec,
|
|
|
|
StackTag);
|
2018-09-25 07:03:34 +08:00
|
|
|
}
|
2018-01-12 06:53:30 +08:00
|
|
|
|
2019-07-10 04:22:36 +08:00
|
|
|
// Pad and align each of the allocas that we instrumented to stop small
|
|
|
|
// uninteresting allocas from hiding in instrumented alloca's padding and so
|
|
|
|
// that we have enough space to store real tags for short granules.
|
|
|
|
DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
|
|
|
|
for (AllocaInst *AI : AllocasToInstrument) {
|
|
|
|
uint64_t Size = getAllocaSizeInBytes(*AI);
|
2019-08-07 06:07:29 +08:00
|
|
|
uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
|
|
|
|
AI->setAlignment(
|
|
|
|
std::max(AI->getAlignment(), Mapping.getObjectAlignment()));
|
2019-07-10 04:22:36 +08:00
|
|
|
if (Size != AlignedSize) {
|
2019-07-16 11:25:50 +08:00
|
|
|
Type *AllocatedType = AI->getAllocatedType();
|
|
|
|
if (AI->isArrayAllocation()) {
|
|
|
|
uint64_t ArraySize =
|
|
|
|
cast<ConstantInt>(AI->getArraySize())->getZExtValue();
|
|
|
|
AllocatedType = ArrayType::get(AllocatedType, ArraySize);
|
|
|
|
}
|
2019-07-10 04:22:36 +08:00
|
|
|
Type *TypeWithPadding = StructType::get(
|
2019-07-16 11:25:50 +08:00
|
|
|
AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
|
2019-07-10 04:22:36 +08:00
|
|
|
auto *NewAI = new AllocaInst(
|
|
|
|
TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
|
|
|
|
NewAI->takeName(AI);
|
|
|
|
NewAI->setAlignment(AI->getAlignment());
|
|
|
|
NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
|
|
|
|
NewAI->setSwiftError(AI->isSwiftError());
|
|
|
|
NewAI->copyMetadata(*AI);
|
2019-07-16 11:25:50 +08:00
|
|
|
auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
|
|
|
|
AI->replaceAllUsesWith(Bitcast);
|
2019-07-10 04:22:36 +08:00
|
|
|
AllocaToPaddedAllocaMap[AI] = NewAI;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!AllocaToPaddedAllocaMap.empty()) {
|
|
|
|
for (auto &BB : F)
|
|
|
|
for (auto &Inst : BB)
|
|
|
|
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
|
|
|
|
if (auto *AI =
|
|
|
|
dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
|
|
|
|
if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
|
|
|
|
DVI->setArgOperand(
|
|
|
|
0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
|
|
|
|
for (auto &P : AllocaToPaddedAllocaMap)
|
|
|
|
P.first->eraseFromParent();
|
|
|
|
}
|
|
|
|
|
2019-01-25 10:08:46 +08:00
|
|
|
// If we split the entry block, move any allocas that were originally in the
|
|
|
|
// entry block back into the entry block so that they aren't treated as
|
|
|
|
// dynamic allocas.
|
|
|
|
if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
|
|
|
|
InsertPt = &*F.getEntryBlock().begin();
|
|
|
|
for (auto II = EntryIRB.GetInsertBlock()->begin(),
|
|
|
|
IE = EntryIRB.GetInsertBlock()->end();
|
|
|
|
II != IE;) {
|
|
|
|
Instruction *I = &*II++;
|
|
|
|
if (auto *AI = dyn_cast<AllocaInst>(I))
|
|
|
|
if (isa<ConstantInt>(AI->getArraySize()))
|
|
|
|
I->moveBefore(InsertPt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
for (auto Inst : ToInstrument)
|
|
|
|
Changed |= instrumentMemAccess(Inst);
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
LocalDynamicShadow = nullptr;
|
2019-06-18 07:39:51 +08:00
|
|
|
StackBaseTag = nullptr;
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2017-12-09 08:21:41 +08:00
|
|
|
return Changed;
|
|
|
|
}
|
2018-04-21 04:04:04 +08:00
|
|
|
|
2019-08-07 06:07:29 +08:00
|
|
|
/// Replace \p GV with a tagged, padded copy and describe it to the runtime.
///
/// The work happens in three stages, in this order:
///  1. A private copy named "<name>.hwasan" is created. When the allocation
///     size of GV's initializer is not a multiple of
///     Mapping.getObjectAlignment(), the initializer is wrapped in an anonymous
///     struct followed by zero bytes, the last of which holds \p Tag.
///  2. One descriptor per chunk of at most 0xfffff0 bytes is emitted into the
///     "hwasan_globals" section and appended to llvm.compiler.used.
///  3. An alias that takes over GV's name, linkage and visibility is created
///     at the copy's address plus (Tag << kPointerTagShift); all uses of GV
///     are redirected to the alias and GV is erased.
///
/// \param GV  Global to instrument. Its initializer is read unconditionally,
///            and the variable itself is erased before returning.
/// \param Tag Tag byte stored in the padding, in each descriptor and in the
///            alias address.
void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
  Constant *TaggedInit = GV->getInitializer();
  const uint64_t OrigSize =
      M.getDataLayout().getTypeAllocSize(TaggedInit->getType());
  const uint64_t PaddedSize = alignTo(OrigSize, Mapping.getObjectAlignment());

  if (PaddedSize != OrigSize) {
    // Grow the initializer to the next multiple of 16 bytes; the final padding
    // byte carries the short granule tag.
    std::vector<uint8_t> PadBytes(PaddedSize - OrigSize, 0);
    PadBytes.back() = Tag;
    Constant *Padding = ConstantDataArray::get(*C, PadBytes);
    TaggedInit = ConstantStruct::getAnon({TaggedInit, Padding});
  }

  // Created with external linkage first so that copyAttributesFrom() can be
  // applied, then demoted to private.
  auto *TaggedGV = new GlobalVariable(
      M, TaggedInit->getType(), GV->isConstant(), GlobalValue::ExternalLinkage,
      TaggedInit, GV->getName() + ".hwasan");
  TaggedGV->copyAttributesFrom(GV);
  TaggedGV->setLinkage(GlobalValue::PrivateLinkage);
  TaggedGV->copyMetadata(GV, 0);
  TaggedGV->setAlignment(
      std::max(GV->getAlignment(), Mapping.getObjectAlignment()));

  // Two globals carrying different tags must never be folded together by ICF.
  // If the size is a multiple of the tag granularity, the contents can match
  // while the tags (i.e. the symbol values) differ, and ICF does not look at
  // symbols. If the size is not a multiple of the granularity, the short
  // granule tag separates globals with different tags, yet nothing would
  // prevent folding with an uninstrumented (tag 0) global whose last byte
  // happens to equal the short granule tag. Hence the address is significant.
  TaggedGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);

  // Descriptor layout (little-endian assumed):
  //   bytes 0-3: address of the global, relative to the descriptor
  //   bytes 4-6: size of the global (16MB ought to be enough for anyone; when
  //              it is not, several descriptors are emitted)
  //   byte 7:    tag
  auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty);
  const uint64_t MaxDescriptorSize = 0xfffff0;

  uint64_t Covered = 0;
  while (Covered < OrigSize) {
    // The initializer refers to the descriptor's own address, so the variable
    // is created first and given its initializer afterwards.
    auto *Descriptor =
        new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage,
                           nullptr, GV->getName() + ".hwasan.descriptor");

    Constant *GlobalAddr = ConstantExpr::getPtrToInt(TaggedGV, Int64Ty);
    Constant *DescriptorAddr = ConstantExpr::getPtrToInt(Descriptor, Int64Ty);
    Constant *Distance = ConstantExpr::getSub(GlobalAddr, DescriptorAddr);
    Constant *ChunkDistance =
        ConstantExpr::getAdd(Distance, ConstantInt::get(Int64Ty, Covered));
    Constant *RelPtr = ConstantExpr::getTrunc(ChunkDistance, Int32Ty);

    const uint32_t ChunkSize = std::min(OrigSize - Covered, MaxDescriptorSize);
    const uint32_t Packed = ChunkSize | (uint32_t(Tag) << 24);
    Constant *SizeAndTag = ConstantInt::get(Int32Ty, Packed);

    Descriptor->setComdat(TaggedGV->getComdat());
    Descriptor->setInitializer(ConstantStruct::getAnon({RelPtr, SizeAndTag}));
    Descriptor->setSection("hwasan_globals");
    Descriptor->setMetadata(LLVMContext::MD_associated,
                            MDNode::get(*C, ValueAsMetadata::get(TaggedGV)));
    appendToCompilerUsed(M, Descriptor);

    Covered += MaxDescriptorSize;
  }

  // The alias address is the copy's address with the tag added in at
  // kPointerTagShift.
  Constant *TagBits =
      ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift);
  Constant *TaggedAddr =
      ConstantExpr::getAdd(ConstantExpr::getPtrToInt(TaggedGV, Int64Ty), TagBits);
  Constant *Aliasee = ConstantExpr::getIntToPtr(TaggedAddr, GV->getType());

  auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(),
                                    GV->getLinkage(), "", Aliasee, &M);
  Alias->setVisibility(GV->getVisibility());
  Alias->takeName(GV);
  GV->replaceAllUsesWith(Alias);
  GV->eraseFromParent();
}
|
|
|
|
|
|
|
|
void HWAddressSanitizer::instrumentGlobals() {
|
|
|
|
// Start by creating a note that contains pointers to the list of global
|
|
|
|
// descriptors. Adding a note to the output file will cause the linker to
|
|
|
|
// create a PT_NOTE program header pointing to the note that we can use to
|
|
|
|
// find the descriptor list starting from the program headers. A function
|
|
|
|
// provided by the runtime initializes the shadow memory for the globals by
|
|
|
|
// accessing the descriptor list via the note. The dynamic loader needs to
|
|
|
|
// call this function whenever a library is loaded.
|
|
|
|
//
|
|
|
|
// The reason why we use a note for this instead of a more conventional
|
|
|
|
// approach of having a global constructor pass a descriptor list pointer to
|
|
|
|
// the runtime is because of an order of initialization problem. With
|
|
|
|
// constructors we can encounter the following problematic scenario:
|
|
|
|
//
|
|
|
|
// 1) library A depends on library B and also interposes one of B's symbols
|
|
|
|
// 2) B's constructors are called before A's (as required for correctness)
|
|
|
|
// 3) during construction, B accesses one of its "own" globals (actually
|
|
|
|
// interposed by A) and triggers a HWASAN failure due to the initialization
|
|
|
|
// for A not having happened yet
|
|
|
|
//
|
|
|
|
// Even without interposition it is possible to run into similar situations in
|
|
|
|
// cases where two libraries mutually depend on each other.
|
|
|
|
//
|
|
|
|
// We only need one note per binary, so put everything for the note in a
|
|
|
|
// comdat.
|
|
|
|
Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName);
|
|
|
|
|
|
|
|
Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0);
|
|
|
|
auto Start =
|
|
|
|
new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage,
|
|
|
|
nullptr, "__start_hwasan_globals");
|
|
|
|
Start->setVisibility(GlobalValue::HiddenVisibility);
|
|
|
|
Start->setDSOLocal(true);
|
|
|
|
auto Stop =
|
|
|
|
new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage,
|
|
|
|
nullptr, "__stop_hwasan_globals");
|
|
|
|
Stop->setVisibility(GlobalValue::HiddenVisibility);
|
|
|
|
Stop->setDSOLocal(true);
|
|
|
|
|
|
|
|
// Null-terminated so actually 8 bytes, which are required in order to align
|
|
|
|
// the note properly.
|
|
|
|
auto *Name = ConstantDataArray::get(*C, "LLVM\0\0\0");
|
|
|
|
|
|
|
|
auto *NoteTy = StructType::get(Int32Ty, Int32Ty, Int32Ty, Name->getType(),
|
|
|
|
Int32Ty, Int32Ty);
|
|
|
|
auto *Note =
|
|
|
|
new GlobalVariable(M, NoteTy, /*isConstantGlobal=*/true,
|
|
|
|
GlobalValue::PrivateLinkage, nullptr, kHwasanNoteName);
|
|
|
|
Note->setSection(".note.hwasan.globals");
|
|
|
|
Note->setComdat(NoteComdat);
|
|
|
|
Note->setAlignment(4);
|
|
|
|
Note->setDSOLocal(true);
|
|
|
|
|
|
|
|
// The pointers in the note need to be relative so that the note ends up being
|
|
|
|
// placed in rodata, which is the standard location for notes.
|
|
|
|
auto CreateRelPtr = [&](Constant *Ptr) {
|
|
|
|
return ConstantExpr::getTrunc(
|
|
|
|
ConstantExpr::getSub(ConstantExpr::getPtrToInt(Ptr, Int64Ty),
|
|
|
|
ConstantExpr::getPtrToInt(Note, Int64Ty)),
|
|
|
|
Int32Ty);
|
|
|
|
};
|
|
|
|
Note->setInitializer(ConstantStruct::getAnon(
|
|
|
|
{ConstantInt::get(Int32Ty, 8), // n_namesz
|
|
|
|
ConstantInt::get(Int32Ty, 8), // n_descsz
|
|
|
|
ConstantInt::get(Int32Ty, ELF::NT_LLVM_HWASAN_GLOBALS), // n_type
|
|
|
|
Name, CreateRelPtr(Start), CreateRelPtr(Stop)}));
|
|
|
|
appendToCompilerUsed(M, Note);
|
|
|
|
|
|
|
|
// Create a zero-length global in hwasan_globals so that the linker will
|
|
|
|
// always create start and stop symbols.
|
|
|
|
auto Dummy = new GlobalVariable(
|
|
|
|
M, Int8Arr0Ty, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage,
|
|
|
|
Constant::getNullValue(Int8Arr0Ty), "hwasan.dummy.global");
|
|
|
|
Dummy->setSection("hwasan_globals");
|
|
|
|
Dummy->setComdat(NoteComdat);
|
|
|
|
Dummy->setMetadata(LLVMContext::MD_associated,
|
|
|
|
MDNode::get(*C, ValueAsMetadata::get(Note)));
|
|
|
|
appendToCompilerUsed(M, Dummy);
|
|
|
|
|
|
|
|
std::vector<GlobalVariable *> Globals;
|
|
|
|
for (GlobalVariable &GV : M.globals()) {
|
|
|
|
if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") ||
|
|
|
|
GV.isThreadLocal())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Common symbols can't have aliases point to them, so they can't be tagged.
|
|
|
|
if (GV.hasCommonLinkage())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Globals with custom sections may be used in __start_/__stop_ enumeration,
|
|
|
|
// which would be broken both by adding tags and potentially by the extra
|
|
|
|
// padding/alignment that we insert.
|
|
|
|
if (GV.hasSection())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Globals.push_back(&GV);
|
|
|
|
}
|
|
|
|
|
|
|
|
MD5 Hasher;
|
|
|
|
Hasher.update(M.getSourceFileName());
|
|
|
|
MD5::MD5Result Hash;
|
|
|
|
Hasher.final(Hash);
|
|
|
|
uint8_t Tag = Hash[0];
|
|
|
|
|
|
|
|
for (GlobalVariable *GV : Globals) {
|
|
|
|
// Skip tag 0 in order to avoid collisions with untagged memory.
|
|
|
|
if (Tag == 0)
|
|
|
|
Tag = 1;
|
|
|
|
instrumentGlobal(GV, Tag++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-21 04:04:04 +08:00
|
|
|
/// Initialise the mapping fields from the command-line options.
///
/// Scale is always kDefaultShadowScale. The other three fields are chosen by
/// the first matching row:
///
///   condition                                 InGlobal  InTls  Offset
///   ClMappingOffset given on command line     false     false  ClMappingOffset
///   ClEnableKhwasan || ClInstrumentWithCalls  false     false  0
///   ClWithIfunc                               true      false  kDynamicShadowSentinel
///   ClWithTls                                 false     true   kDynamicShadowSentinel
///   otherwise                                 false     false  kDynamicShadowSentinel
///
/// \param TargetTriple Not read by this function.
void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
  Scale = kDefaultShadowScale;

  // Start from the "otherwise" row; each case below overrides only the fields
  // that differ from it.
  InGlobal = false;
  InTls = false;
  Offset = kDynamicShadowSentinel;

  if (ClMappingOffset.getNumOccurrences() > 0) {
    Offset = ClMappingOffset;
    return;
  }

  if (ClEnableKhwasan || ClInstrumentWithCalls) {
    Offset = 0;
    return;
  }

  if (ClWithIfunc) {
    InGlobal = true;
    return;
  }

  if (ClWithTls)
    InTls = true;
}
|