llvm-project/compiler-rt/lib/hwasan/hwasan_linux.cpp

458 lines
14 KiB
C++
Raw Normal View History

//===-- hwasan_linux.cpp ----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file is a part of HWAddressSanitizer and contains Linux-, NetBSD- and
/// FreeBSD-specific code.
///
//===----------------------------------------------------------------------===//
#include "sanitizer_common/sanitizer_platform.h"
#if SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD
#include "hwasan.h"
#include "hwasan_dynamic_shadow.h"
#include "hwasan_interface_internal.h"
#include "hwasan_mapping.h"
#include "hwasan_report.h"
#include "hwasan_thread.h"
#include "hwasan_thread_list.h"
#include <dlfcn.h>
#include <elf.h>
#include <link.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <unwind.h>
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_procmaps.h"
// Configurations of HWASAN_WITH_INTERCEPTORS and SANITIZER_ANDROID.
//
// HWASAN_WITH_INTERCEPTORS=OFF, SANITIZER_ANDROID=OFF
// Not currently tested.
// HWASAN_WITH_INTERCEPTORS=OFF, SANITIZER_ANDROID=ON
// Integration tests downstream exist.
// HWASAN_WITH_INTERCEPTORS=ON, SANITIZER_ANDROID=OFF
// Tested with check-hwasan on x86_64-linux.
// HWASAN_WITH_INTERCEPTORS=ON, SANITIZER_ANDROID=ON
// Tested with check-hwasan on aarch64-linux-android.
#if !SANITIZER_ANDROID
hwasan: Use system allocator to realloc and free untagged pointers in interceptor mode. The Android dynamic loader has a non-standard feature that allows libraries such as the hwasan runtime to interpose symbols even after the symbol already has a value. The new value of the symbol is used to relocate libraries loaded after the interposing library, but existing libraries keep the old value. This behaviour is activated by the DF_1_GLOBAL flag in DT_FLAGS_1, which is set by passing -z global to the linker, which is what we already do to link the hwasan runtime. What this means in practice is that if we have .so files that depend on interceptor-mode hwasan without the main executable depending on it, some of the libraries in the process will be using the hwasan allocator and some will be using the system allocator, and these allocators need to interact somehow. For example, if an instrumented library calls a function such as strdup that allocates memory on behalf of the caller, the instrumented library can reasonably expect to be able to call free to deallocate the memory. We can handle that relatively easily with hwasan by using tag 0 to represent allocations from the system allocator. If hwasan's realloc or free functions are passed a pointer with tag 0, the system allocator is called. One limitation is that this scheme doesn't work in reverse: if an instrumented library allocates memory, it must free the memory itself and cannot pass ownership to a system library. In a future change, we may want to expose an API for calling the system allocator so that instrumented libraries can safely transfer ownership of memory to system libraries. Differential Revision: https://reviews.llvm.org/D55986 llvm-svn: 350427
2019-01-05 03:21:51 +08:00
// Per-thread word whose address GetCurrentThreadLongPtr() returns on
// non-Android builds. GetCurrentThread() reinterprets that address as a
// StackAllocationsRingBuffer and, when interceptors are enabled, treats a zero
// value as "thread not yet entered".
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL uptr __hwasan_tls;
#endif
namespace __hwasan {
// Maps the inclusive range [beg, end] with MmapFixedNoReserve under the label
// `name`. Both `beg` and `end + 1` must be multiples of the mmap granularity.
// If the mapping fails, a diagnostic is reported and Abort() is called.
static void ReserveShadowMemoryRange(uptr beg, uptr end, const char *name) {
  CHECK_EQ((beg % GetMmapGranularity()), 0);
  CHECK_EQ(((end + 1) % GetMmapGranularity()), 0);
  const uptr length = end - beg + 1;
  // Don't count the shadow against mmap_limit_mb.
  DecreaseTotalMmap(length);
  if (MmapFixedNoReserve(beg, length, name))
    return;
  Report(
      "ReserveShadowMemoryRange failed while trying to map 0x%zx bytes. "
      "Perhaps you're using ulimit -v\n",
      length);
  Abort();
}
// Protects `size` bytes starting at `addr` with MmapFixedNoAccess, labelled
// "shadow gap". A zero `size` is a no-op. If the mapping cannot be placed at
// the requested address, the failure is reported, the process map is dumped
// and Die() is called.
static void ProtectGap(uptr addr, uptr size) {
  if (size == 0)
    return;
  if ((uptr)MmapFixedNoAccess(addr, size, "shadow gap") == addr)
    return;
  // The first few pages of the address space can not be protected. We still
  // want to cover as much as possible so that a non-FIXED mmap() never hands
  // this memory out, so retry from successively higher page boundaries.
  if (addr == 0) {
    const uptr page = GetMmapGranularity();
    while (size > page) {
      addr += page;
      size -= page;
      void *mapped = MmapFixedNoAccess(addr, size, "shadow gap");
      if ((uptr)mapped == addr)
        return;
    }
  }
  Report(
      "ERROR: Failed to protect shadow gap [%p, %p]. "
      "HWASan cannot proceed correctly. ABORTING.\n",
      (void *)addr, (void *)(addr + size));
  DumpProcessMap();
  Die();
}
// Address-space layout boundaries. All of them are assigned in InitShadow();
// each *End value is the last address of its region (inclusive), which is why
// region sizes elsewhere in this file are computed as end - start + 1.
static uptr kLowMemStart;
static uptr kLowMemEnd;
static uptr kLowShadowEnd;
static uptr kLowShadowStart;
static uptr kHighShadowStart;
static uptr kHighShadowEnd;
static uptr kHighMemStart;
static uptr kHighMemEnd;
// Prints one row of the layout table: the inclusive range [start, end]
// followed by `name`, which is printed with a precision of 10 characters.
static void PrintRange(uptr start, uptr end, const char *name) {
  const int max_name_chars = 10;
  void *const first = (void *)start;
  void *const last = (void *)end;
  Printf("|| [%p, %p] || %.*s ||\n", first, last, max_name_chars, name);
}
// Prints the address-space layout from the highest region to the lowest:
// HighMem, HighShadow, LowShadow, LowMem, with a "ShadowGap" row wherever two
// neighbouring regions are not adjacent. Where no gap is printed, the two
// neighbours are CHECKed to be exactly adjacent. Finally CHECKs that low
// memory starts at address 0.
static void PrintAddressSpaceLayout() {
  PrintRange(kHighMemStart, kHighMemEnd, "HighMem");
  if (kHighShadowEnd + 1 < kHighMemStart)
    PrintRange(kHighShadowEnd + 1, kHighMemStart - 1, "ShadowGap");
  else
    CHECK_EQ(kHighShadowEnd + 1, kHighMemStart);
  PrintRange(kHighShadowStart, kHighShadowEnd, "HighShadow");
  if (kLowShadowEnd + 1 < kHighShadowStart)
    PrintRange(kLowShadowEnd + 1, kHighShadowStart - 1, "ShadowGap");
  else
    // The region directly below HighShadow is LowShadow, so adjacency must be
    // asserted against kLowShadowEnd (the same quantity the condition above
    // tests), not against kLowMemEnd.
    CHECK_EQ(kLowShadowEnd + 1, kHighShadowStart);
  PrintRange(kLowShadowStart, kLowShadowEnd, "LowShadow");
  if (kLowMemEnd + 1 < kLowShadowStart)
    PrintRange(kLowMemEnd + 1, kLowShadowStart - 1, "ShadowGap");
  else
    CHECK_EQ(kLowMemEnd + 1, kLowShadowStart);
  PrintRange(kLowMemStart, kLowMemEnd, "LowMem");
  CHECK_EQ(0, kLowMemStart);
}
// Returns the last address of HighMem: the maximum user virtual address with
// its low bits forced to one so that kHighMemEnd and kHighMemStart come out
// properly aligned.
static uptr GetHighMemEnd() {
  // HighMem covers the upper part of the address space.
  const uptr top = GetMaxUserVirtualAddress();
  const uptr low_bits = (GetMmapGranularity() << kShadowScale) - 1;
  return top | low_bits;
}
// Asks FindDynamicShadowStart for a base able to hold `shadow_size_bytes` and
// stores the result in __hwasan_shadow_memory_dynamic_address.
static void InitializeShadowBaseAddress(uptr shadow_size_bytes) {
  const uptr shadow_base = FindDynamicShadowStart(shadow_size_bytes);
  __hwasan_shadow_memory_dynamic_address = shadow_base;
}
// Lays out and maps the shadow for the whole address space. Computes the
// low/high application-memory and shadow boundaries around the base stored in
// __hwasan_shadow_memory_dynamic_address, CHECKs their relative ordering,
// reserves both shadow ranges and protects the gaps between regions.
// Returns true; the only return statement is the final one (the helpers called
// here abort or die on failure instead of returning an error).
bool InitShadow() {
// Define the entire memory range.
kHighMemEnd = GetHighMemEnd();
// Determine shadow memory base offset.
InitializeShadowBaseAddress(MemToShadowSize(kHighMemEnd));
// Place the low memory first.
// Low memory is everything below the shadow base.
kLowMemEnd = __hwasan_shadow_memory_dynamic_address - 1;
kLowMemStart = 0;
// Define the low shadow based on the already placed low memory.
kLowShadowEnd = MemToShadow(kLowMemEnd);
kLowShadowStart = __hwasan_shadow_memory_dynamic_address;
// High shadow takes whatever memory is left up there (making sure it is not
// interfering with low memory in the fixed case).
kHighShadowEnd = MemToShadow(kHighMemEnd);
kHighShadowStart = Max(kLowMemEnd, MemToShadow(kHighShadowEnd)) + 1;
// High memory starts where allocated shadow allows.
kHighMemStart = ShadowToMem(kHighShadowStart);
// Check the sanity of the defined memory ranges (there might be gaps).
CHECK_EQ(kHighMemStart % GetMmapGranularity(), 0);
CHECK_GT(kHighMemStart, kHighShadowEnd);
CHECK_GT(kHighShadowEnd, kHighShadowStart);
CHECK_GT(kHighShadowStart, kLowMemEnd);
CHECK_GT(kLowMemEnd, kLowMemStart);
CHECK_GT(kLowShadowEnd, kLowShadowStart);
CHECK_GT(kLowShadowStart, kLowMemEnd);
if (Verbosity())
PrintAddressSpaceLayout();
// Reserve shadow memory.
ReserveShadowMemoryRange(kLowShadowStart, kLowShadowEnd, "low shadow");
ReserveShadowMemoryRange(kHighShadowStart, kHighShadowEnd, "high shadow");
// Protect all the gaps.
// kLowMemStart was set to 0 above, so this size is 0 and ProtectGap returns
// without mapping anything.
ProtectGap(0, Min(kLowMemStart, kLowShadowStart));
if (kLowMemEnd + 1 < kLowShadowStart)
ProtectGap(kLowMemEnd + 1, kLowShadowStart - kLowMemEnd - 1);
if (kLowShadowEnd + 1 < kHighShadowStart)
ProtectGap(kLowShadowEnd + 1, kHighShadowStart - kLowShadowEnd - 1);
if (kHighShadowEnd + 1 < kHighMemStart)
ProtectGap(kHighShadowEnd + 1, kHighMemStart - kHighShadowEnd - 1);
return true;
}
void InitThreads() {
CHECK(__hwasan_shadow_memory_dynamic_address);
uptr guard_page_size = GetMmapGranularity();
uptr thread_space_start =
__hwasan_shadow_memory_dynamic_address - (1ULL << kShadowBaseAlignment);
uptr thread_space_end =
__hwasan_shadow_memory_dynamic_address - guard_page_size;
ReserveShadowMemoryRange(thread_space_start, thread_space_end - 1,
"hwasan threads");
ProtectGap(thread_space_end,
__hwasan_shadow_memory_dynamic_address - thread_space_end);
InitThreadList(thread_space_start, thread_space_end - thread_space_start);
}
// Applies SetShadowRegionHugePageMode to the inclusive range [beg, end] and,
// when the use_madv_dontdump common flag is set, also DontDumpShadowMemory.
static void MadviseShadowRegion(uptr beg, uptr end) {
  const uptr length = end - beg + 1;
  SetShadowRegionHugePageMode(beg, length);
  if (!common_flags()->use_madv_dontdump)
    return;
  DontDumpShadowMemory(beg, length);
}
void MadviseShadow() {
MadviseShadowRegion(kLowShadowStart, kLowShadowEnd);
MadviseShadowRegion(kHighShadowStart, kHighShadowEnd);
}
// Returns true if the untagged address `p` lies in application memory, i.e. at
// or above kHighMemStart or within [kLowMemStart, kLowMemEnd]. `p` must carry
// tag 0.
bool MemIsApp(uptr p) {
  CHECK(GetTagFromPointer(p) == 0);
  if (p >= kHighMemStart)
    return true;
  return kLowMemStart <= p && p <= kLowMemEnd;
}
// Exit-time hook. Optionally dumps the process map and statistics as selected
// by flags, and, if any report was produced and the exitcode common flag is
// non-zero, terminates via internal__exit with that code.
static void HwasanAtExit(void) {
  if (common_flags()->print_module_map)
    DumpProcessMap();
  if (flags()->print_stats) {
    if (flags()->atexit || hwasan_report_count > 0)
      ReportStats();
  }
  if (hwasan_report_count <= 0)
    return;
  // ReportAtExitStatistics();
  if (common_flags()->exitcode)
    internal__exit(common_flags()->exitcode);
}
// Registers HwasanAtExit with atexit(). The return value of atexit() is not
// inspected.
void InstallAtExitHandler() {
  atexit(&HwasanAtExit);
}
// ---------------------- TSD ---------------- {{{1
// Creates the runtime's Thread object for the calling thread through the
// global thread list and initializes its random state.
extern "C" void __hwasan_thread_enter() {
  auto *created = hwasanThreadList().CreateCurrentThread();
  created->InitRandomState();
}
extern "C" void __hwasan_thread_exit() {
Thread *t = GetCurrentThread();
// Make sure that signal handler can not see a stale current thread pointer.
atomic_signal_fence(memory_order_seq_cst);
if (t)
hwasanThreadList().ReleaseThread(t);
}
#if HWASAN_WITH_INTERCEPTORS
// pthread key whose destructor (HwasanTSDDtor) runs thread teardown; created
// in HwasanTSDInit().
static pthread_key_t tsd_key;
// Set to true by HwasanTSDInit(); HwasanTSDThreadInit() does nothing until
// then.
static bool tsd_key_inited = false;
// Arms the TSD destructor for the calling thread by storing the value of
// GetPthreadDestructorIterations() under tsd_key. Does nothing if the key has
// not been created yet.
void HwasanTSDThreadInit() {
  if (!tsd_key_inited)
    return;
  CHECK_EQ(0, pthread_setspecific(tsd_key,
                                  (void *)GetPthreadDestructorIterations()));
}
// Destructor for tsd_key. `tsd` holds a countdown: while it is greater than 1
// the value is decremented and stored back under the key (and nothing else
// happens); once it reaches 1 or less, __hwasan_thread_exit() is called.
void HwasanTSDDtor(void *tsd) {
  uptr iterations = (uptr)tsd;
  if (iterations <= 1) {
    __hwasan_thread_exit();
    return;
  }
  CHECK_EQ(0, pthread_setspecific(tsd_key, (void *)(iterations - 1)));
}
// Creates tsd_key with HwasanTSDDtor as its destructor. Must be called at most
// once: a second call fails the first CHECK.
void HwasanTSDInit() {
CHECK(!tsd_key_inited);
// The flag is set before the key is created; a failure of pthread_key_create
// is caught by the CHECK_EQ below.
tsd_key_inited = true;
CHECK_EQ(0, pthread_key_create(&tsd_key, HwasanTSDDtor));
}
#else
// Without interceptors (HWASAN_WITH_INTERCEPTORS is false) no pthread key is
// used, so both TSD entry points are no-ops.
void HwasanTSDInit() {}
void HwasanTSDThreadInit() {}
#endif
#if SANITIZER_ANDROID
// Android: the per-thread word is the slot returned by get_android_tls_ptr().
uptr *GetCurrentThreadLongPtr() {
  auto *slot = get_android_tls_ptr();
  return (uptr *)slot;
}
#else
// Non-Android: the per-thread word is the thread-local __hwasan_tls.
uptr *GetCurrentThreadLongPtr() {
  uptr *slot = &__hwasan_tls;
  return slot;
}
#endif
#if SANITIZER_ANDROID
void AndroidTestTlsSlot() {
uptr kMagicValue = 0x010203040A0B0C0D;
uptr *tls_ptr = GetCurrentThreadLongPtr();
uptr old_value = *tls_ptr;
*tls_ptr = kMagicValue;
dlerror();
if (*(uptr *)get_android_tls_ptr() != kMagicValue) {
Printf(
"ERROR: Incompatible version of Android: TLS_SLOT_SANITIZER(6) is used "
"for dlerror().\n");
Die();
}
*tls_ptr = old_value;
}
#else
// Non-Android builds have no TLS slot to test; this is a no-op.
void AndroidTestTlsSlot() {}
#endif
// Returns the Thread for the calling thread, looked up in the thread list by
// the address the per-thread ring buffer reports as Next(). With interceptors
// enabled, a zero per-thread word means the thread has not been entered yet,
// so __hwasan_thread_enter() is called first.
Thread *GetCurrentThread() {
  uptr *tls_slot = GetCurrentThreadLongPtr();
#if HWASAN_WITH_INTERCEPTORS
  if (*tls_slot == 0)
    __hwasan_thread_enter();
#endif
  auto *ring = (StackAllocationsRingBuffer *)tls_slot;
  const uptr buffer_addr = (uptr)(ring->Next());
  return hwasanThreadList().GetThreadByBufferAddress(buffer_addr);
}
// Decoded description of one checked memory access, as produced by
// GetAccessInfo(). A value-initialized AccessInfo (is_store and is_load both
// false) means "not a HWASan trap"; HwasanOnSIGTRAP relies on that.
// Field order matters: GetAccessInfo() fills this struct by brace-init.
struct AccessInfo {
// Address that was accessed.
uptr addr;
// Size of the access: either 1 << size_log or a value read from a register.
uptr size;
// Exactly one of is_store / is_load is true for a decoded access.
bool is_store;
bool is_load;
// True if the instrumentation marked the error as recoverable.
bool recover;
};
// Decodes the trapping instruction to recover the memory access that failed
// its tag check. `info` and `uc` are the signal info and context. Returns a
// value-initialized AccessInfo (is_store and is_load both false) when the
// instruction does not match the expected encoding, so callers can tell the
// trap is not a HWASan one.
static AccessInfo GetAccessInfo(siginfo_t *info, ucontext_t *uc) {
// Access type is passed in a platform dependent way (see below) and encoded
// as 0xXY, where X&1 is 1 for store, 0 for load, and X&2 is 1 if the error is
// recoverable. Valid values of Y are 0 to 4, which are interpreted as
// log2(access_size), and 0xF, which means that access size is passed via
// platform dependent register (see below).
#if defined(__aarch64__)
// Access type is encoded in BRK immediate as 0x900 + 0xXY. For Y == 0xF,
// access size is stored in X1 register. Access address is always in X0
// register.
uptr pc = (uptr)info->si_addr;
// Extract the 16-bit field at bits [20:5] of the 32-bit instruction word.
const unsigned code = ((*(u32 *)pc) >> 5) & 0xffff;
if ((code & 0xff00) != 0x900)
return AccessInfo{}; // Not ours.
const bool is_store = code & 0x10;
const bool recover = code & 0x20;
const uptr addr = uc->uc_mcontext.regs[0];
const unsigned size_log = code & 0xf;
if (size_log > 4 && size_log != 0xf)
return AccessInfo{}; // Not ours.
const uptr size = size_log == 0xf ? uc->uc_mcontext.regs[1] : 1U << size_log;
#elif defined(__x86_64__)
// Access type is encoded in the instruction following INT3 as
// NOP DWORD ptr [EAX + 0x40 + 0xXY]. For Y == 0xF, access size is stored in
// RSI register. Access address is always in RDI register.
uptr pc = (uptr)uc->uc_mcontext.gregs[REG_RIP];
uint8_t *nop = (uint8_t*)pc;
// Match the three leading bytes 0f 1f 40 and require the fourth byte to be
// at least the 0x40 base.
if (*nop != 0x0f || *(nop + 1) != 0x1f || *(nop + 2) != 0x40 ||
*(nop + 3) < 0x40)
return AccessInfo{}; // Not ours.
// The masks below (0x10, 0x20, 0xf) do not overlap the 0x40 base, so the
// byte is used without subtracting it.
const unsigned code = *(nop + 3);
const bool is_store = code & 0x10;
const bool recover = code & 0x20;
const uptr addr = uc->uc_mcontext.gregs[REG_RDI];
const unsigned size_log = code & 0xf;
if (size_log > 4 && size_log != 0xf)
return AccessInfo{}; // Not ours.
const uptr size =
size_log == 0xf ? uc->uc_mcontext.gregs[REG_RSI] : 1U << size_log;
#else
# error Unsupported architecture
#endif
// is_load is simply the complement of is_store for a decoded access.
return AccessInfo{addr, size, is_store, !is_store, recover};
}
hwasan: Move memory access checks into small outlined functions on aarch64. Each hwasan check requires emitting a small piece of code like this: https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses The problem with this is that these code blocks typically bloat code size significantly. An obvious solution is to outline these blocks of code. In fact, this has already been implemented under the -hwasan-instrument-with-calls flag. However, as currently implemented this has a number of problems: - The functions use the same calling convention as regular C functions. This means that the backend must spill all temporary registers as required by the platform's C calling convention, even though the check only needs two registers on the hot path. - The functions take the address to be checked in a fixed register, which increases register pressure. Both of these factors can diminish the code size effect and increase the performance hit of -hwasan-instrument-with-calls. The solution that this patch implements is to involve the aarch64 backend in outlining the checks. An intrinsic and pseudo-instruction are created to represent a hwasan check. The pseudo-instruction is register allocated like any other instruction, and we allow the register allocator to select almost any register for the address to check. A particular combination of (register selection, type of check) triggers the creation in the backend of a function to handle the check for specifically that pair. The resulting functions are deduplicated by the linker. The pseudo-instruction (really the function) is specified to preserve all registers except for the registers that the AAPCS specifies may be clobbered by a call. To measure the code size and performance effect of this change, I took a number of measurements using Chromium for Android on aarch64, comparing a browser with inlined checks (the baseline) against a browser with outlined checks. 
Code size: Size of .text decreases from 243897420 to 171619972 bytes, or a 30% decrease. Performance: Using Chromium's blink_perf.layout microbenchmarks I measured a median performance regression of 6.24%. The fact that a perf/size tradeoff is evident here suggests that we might want to make the new behaviour conditional on -Os/-Oz. But for now I've enabled it unconditionally, my reasoning being that hwasan users typically expect a relatively large perf hit, and ~6% isn't really adding much. We may want to revisit this decision in the future, though. I also tried experimenting with varying the number of registers selectable by the hwasan check pseudo-instruction (which would result in fewer variants being created), on the hypothesis that creating fewer variants of the function would expose another perf/size tradeoff by reducing icache pressure from the check functions at the cost of register pressure. Although I did observe a code size increase with fewer registers, I did not observe a strong correlation between the number of registers and the performance of the resulting browser on the microbenchmarks, so I conclude that we might as well use ~all registers to get the maximum code size improvement. My results are below: Regs | .text size | Perf hit -----+------------+--------- ~all | 171619972 | 6.24% 16 | 171765192 | 7.03% 8 | 172917788 | 5.82% 4 | 177054016 | 6.89% Differential Revision: https://reviews.llvm.org/D56954 llvm-svn: 351920
2019-01-23 10:20:10 +08:00
static void HandleTagMismatch(AccessInfo ai, uptr pc, uptr frame,
[HWASan] Save + print registers when tag mismatch occurs in AArch64. Summary: This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance. In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable. The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format: Registers where the failure occurred (pc 0x0055555561b4): x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025 x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001 x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000 x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000 x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078 x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000 x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000 x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4 ... and prints after the dump of memory tags around the buggy address. Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. 
As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging. Reviewers: pcc, eugenis Reviewed By: pcc, eugenis Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers Tags: #sanitizers, #llvm Differential Revision: https://reviews.llvm.org/D58857 llvm-svn: 355738
2019-03-09 05:22:35 +08:00
ucontext_t *uc, uptr *registers_frame = nullptr) {
InternalMmapVector<BufferedStackTrace> stack_buffer(1);
BufferedStackTrace *stack = stack_buffer.data();
stack->Reset();
stack->Unwind(pc, frame, uc, common_flags()->fast_unwind_on_fatal);
[HWASan] Save + print registers when tag mismatch occurs in AArch64. Summary: This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance. In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable. The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format: Registers where the failure occurred (pc 0x0055555561b4): x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025 x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001 x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000 x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000 x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078 x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000 x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000 x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4 ... and prints after the dump of memory tags around the buggy address. Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. 
As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging. Reviewers: pcc, eugenis Reviewed By: pcc, eugenis Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers Tags: #sanitizers, #llvm Differential Revision: https://reviews.llvm.org/D58857 llvm-svn: 355738
2019-03-09 05:22:35 +08:00
// The second stack frame contains the failure __hwasan_check function, as
// we have a stack frame for the registers saved in __hwasan_tag_mismatch that
// we wish to ignore. This (currently) only occurs on AArch64, as x64
// implementations use SIGTRAP to implement the failure, and thus do not go
// through the stack saver.
if (registers_frame && stack->trace && stack->size > 0) {
stack->trace++;
stack->size--;
}
bool fatal = flags()->halt_on_error || !ai.recover;
[HWASan] Save + print registers when tag mismatch occurs in AArch64. Summary: This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance. In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable. The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format: Registers where the failure occurred (pc 0x0055555561b4): x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025 x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001 x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000 x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000 x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078 x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000 x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000 x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4 ... and prints after the dump of memory tags around the buggy address. Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. 
As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging. Reviewers: pcc, eugenis Reviewed By: pcc, eugenis Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers Tags: #sanitizers, #llvm Differential Revision: https://reviews.llvm.org/D58857 llvm-svn: 355738
2019-03-09 05:22:35 +08:00
ReportTagMismatch(stack, ai.addr, ai.size, ai.is_store, fatal,
registers_frame);
hwasan: Move memory access checks into small outlined functions on aarch64. Each hwasan check requires emitting a small piece of code like this: https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses The problem with this is that these code blocks typically bloat code size significantly. An obvious solution is to outline these blocks of code. In fact, this has already been implemented under the -hwasan-instrument-with-calls flag. However, as currently implemented this has a number of problems: - The functions use the same calling convention as regular C functions. This means that the backend must spill all temporary registers as required by the platform's C calling convention, even though the check only needs two registers on the hot path. - The functions take the address to be checked in a fixed register, which increases register pressure. Both of these factors can diminish the code size effect and increase the performance hit of -hwasan-instrument-with-calls. The solution that this patch implements is to involve the aarch64 backend in outlining the checks. An intrinsic and pseudo-instruction are created to represent a hwasan check. The pseudo-instruction is register allocated like any other instruction, and we allow the register allocator to select almost any register for the address to check. A particular combination of (register selection, type of check) triggers the creation in the backend of a function to handle the check for specifically that pair. The resulting functions are deduplicated by the linker. The pseudo-instruction (really the function) is specified to preserve all registers except for the registers that the AAPCS specifies may be clobbered by a call. To measure the code size and performance effect of this change, I took a number of measurements using Chromium for Android on aarch64, comparing a browser with inlined checks (the baseline) against a browser with outlined checks. 
Code size: Size of .text decreases from 243897420 to 171619972 bytes, or a 30% decrease. Performance: Using Chromium's blink_perf.layout microbenchmarks I measured a median performance regression of 6.24%. The fact that a perf/size tradeoff is evident here suggests that we might want to make the new behaviour conditional on -Os/-Oz. But for now I've enabled it unconditionally, my reasoning being that hwasan users typically expect a relatively large perf hit, and ~6% isn't really adding much. We may want to revisit this decision in the future, though. I also tried experimenting with varying the number of registers selectable by the hwasan check pseudo-instruction (which would result in fewer variants being created), on the hypothesis that creating fewer variants of the function would expose another perf/size tradeoff by reducing icache pressure from the check functions at the cost of register pressure. Although I did observe a code size increase with fewer registers, I did not observe a strong correlation between the number of registers and the performance of the resulting browser on the microbenchmarks, so I conclude that we might as well use ~all registers to get the maximum code size improvement. My results are below: Regs | .text size | Perf hit -----+------------+--------- ~all | 171619972 | 6.24% 16 | 171765192 | 7.03% 8 | 172917788 | 5.82% 4 | 177054016 | 6.89% Differential Revision: https://reviews.llvm.org/D56954 llvm-svn: 351920
2019-01-23 10:20:10 +08:00
}
static bool HwasanOnSIGTRAP(int signo, siginfo_t *info, ucontext_t *uc) {
AccessInfo ai = GetAccessInfo(info, uc);
if (!ai.is_store && !ai.is_load)
return false;
SignalContext sig{info, uc};
HandleTagMismatch(ai, StackTrace::GetNextInstructionPc(sig.pc), sig.bp, uc);
#if defined(__aarch64__)
uc->uc_mcontext.pc += 4;
#elif defined(__x86_64__)
#else
# error Unsupported architecture
#endif
return true;
}
[HWASan] Save + print registers when tag mismatch occurs in AArch64. Summary: This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance. In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable. The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format: Registers where the failure occurred (pc 0x0055555561b4): x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025 x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001 x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000 x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000 x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078 x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000 x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000 x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4 ... and prints after the dump of memory tags around the buggy address. Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. 
As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging. Reviewers: pcc, eugenis Reviewed By: pcc, eugenis Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers Tags: #sanitizers, #llvm Differential Revision: https://reviews.llvm.org/D58857 llvm-svn: 355738
2019-03-09 05:22:35 +08:00
// Entry point stub for interoperability between __hwasan_tag_mismatch (ASM) and
// the rest of the mismatch handling code (C++).
extern "C" void __hwasan_tag_mismatch_stub(uptr addr, uptr access_info,
uptr *registers_frame) {
hwasan: Move memory access checks into small outlined functions on aarch64. Each hwasan check requires emitting a small piece of code like this: https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses The problem with this is that these code blocks typically bloat code size significantly. An obvious solution is to outline these blocks of code. In fact, this has already been implemented under the -hwasan-instrument-with-calls flag. However, as currently implemented this has a number of problems: - The functions use the same calling convention as regular C functions. This means that the backend must spill all temporary registers as required by the platform's C calling convention, even though the check only needs two registers on the hot path. - The functions take the address to be checked in a fixed register, which increases register pressure. Both of these factors can diminish the code size effect and increase the performance hit of -hwasan-instrument-with-calls. The solution that this patch implements is to involve the aarch64 backend in outlining the checks. An intrinsic and pseudo-instruction are created to represent a hwasan check. The pseudo-instruction is register allocated like any other instruction, and we allow the register allocator to select almost any register for the address to check. A particular combination of (register selection, type of check) triggers the creation in the backend of a function to handle the check for specifically that pair. The resulting functions are deduplicated by the linker. The pseudo-instruction (really the function) is specified to preserve all registers except for the registers that the AAPCS specifies may be clobbered by a call. To measure the code size and performance effect of this change, I took a number of measurements using Chromium for Android on aarch64, comparing a browser with inlined checks (the baseline) against a browser with outlined checks. 
Code size: Size of .text decreases from 243897420 to 171619972 bytes, or a 30% decrease. Performance: Using Chromium's blink_perf.layout microbenchmarks I measured a median performance regression of 6.24%. The fact that a perf/size tradeoff is evident here suggests that we might want to make the new behaviour conditional on -Os/-Oz. But for now I've enabled it unconditionally, my reasoning being that hwasan users typically expect a relatively large perf hit, and ~6% isn't really adding much. We may want to revisit this decision in the future, though. I also tried experimenting with varying the number of registers selectable by the hwasan check pseudo-instruction (which would result in fewer variants being created), on the hypothesis that creating fewer variants of the function would expose another perf/size tradeoff by reducing icache pressure from the check functions at the cost of register pressure. Although I did observe a code size increase with fewer registers, I did not observe a strong correlation between the number of registers and the performance of the resulting browser on the microbenchmarks, so I conclude that we might as well use ~all registers to get the maximum code size improvement. My results are below: Regs | .text size | Perf hit -----+------------+--------- ~all | 171619972 | 6.24% 16 | 171765192 | 7.03% 8 | 172917788 | 5.82% 4 | 177054016 | 6.89% Differential Revision: https://reviews.llvm.org/D56954 llvm-svn: 351920
2019-01-23 10:20:10 +08:00
AccessInfo ai;
ai.is_store = access_info & 0x10;
ai.recover = false;
ai.addr = addr;
ai.size = 1 << (access_info & 0xf);
HandleTagMismatch(ai, (uptr)__builtin_return_address(0),
[HWASan] Save + print registers when tag mismatch occurs in AArch64. Summary: This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance. In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable. The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format: Registers where the failure occurred (pc 0x0055555561b4): x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025 x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001 x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000 x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000 x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078 x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000 x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000 x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4 ... and prints after the dump of memory tags around the buggy address. Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. 
As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging. Reviewers: pcc, eugenis Reviewed By: pcc, eugenis Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers Tags: #sanitizers, #llvm Differential Revision: https://reviews.llvm.org/D58857 llvm-svn: 355738
2019-03-09 05:22:35 +08:00
(uptr)__builtin_frame_address(0), nullptr, registers_frame);
hwasan: Move memory access checks into small outlined functions on aarch64. Each hwasan check requires emitting a small piece of code like this: https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses The problem with this is that these code blocks typically bloat code size significantly. An obvious solution is to outline these blocks of code. In fact, this has already been implemented under the -hwasan-instrument-with-calls flag. However, as currently implemented this has a number of problems: - The functions use the same calling convention as regular C functions. This means that the backend must spill all temporary registers as required by the platform's C calling convention, even though the check only needs two registers on the hot path. - The functions take the address to be checked in a fixed register, which increases register pressure. Both of these factors can diminish the code size effect and increase the performance hit of -hwasan-instrument-with-calls. The solution that this patch implements is to involve the aarch64 backend in outlining the checks. An intrinsic and pseudo-instruction are created to represent a hwasan check. The pseudo-instruction is register allocated like any other instruction, and we allow the register allocator to select almost any register for the address to check. A particular combination of (register selection, type of check) triggers the creation in the backend of a function to handle the check for specifically that pair. The resulting functions are deduplicated by the linker. The pseudo-instruction (really the function) is specified to preserve all registers except for the registers that the AAPCS specifies may be clobbered by a call. To measure the code size and performance effect of this change, I took a number of measurements using Chromium for Android on aarch64, comparing a browser with inlined checks (the baseline) against a browser with outlined checks. 
Code size: Size of .text decreases from 243897420 to 171619972 bytes, or a 30% decrease. Performance: Using Chromium's blink_perf.layout microbenchmarks I measured a median performance regression of 6.24%. The fact that a perf/size tradeoff is evident here suggests that we might want to make the new behaviour conditional on -Os/-Oz. But for now I've enabled it unconditionally, my reasoning being that hwasan users typically expect a relatively large perf hit, and ~6% isn't really adding much. We may want to revisit this decision in the future, though. I also tried experimenting with varying the number of registers selectable by the hwasan check pseudo-instruction (which would result in fewer variants being created), on the hypothesis that creating fewer variants of the function would expose another perf/size tradeoff by reducing icache pressure from the check functions at the cost of register pressure. Although I did observe a code size increase with fewer registers, I did not observe a strong correlation between the number of registers and the performance of the resulting browser on the microbenchmarks, so I conclude that we might as well use ~all registers to get the maximum code size improvement. My results are below: Regs | .text size | Perf hit -----+------------+--------- ~all | 171619972 | 6.24% 16 | 171765192 | 7.03% 8 | 172917788 | 5.82% 4 | 177054016 | 6.89% Differential Revision: https://reviews.llvm.org/D56954 llvm-svn: 351920
2019-01-23 10:20:10 +08:00
__builtin_unreachable();
}
// Unwind callback passed to HandleDeadlySignal. Fills `stack` using the pc, bp
// and context recorded in `sig`; the second (unnamed) argument is ignored.
// Whether the fast unwinder is requested comes from the
// fast_unwind_on_fatal common flag.
static void OnStackUnwind(const SignalContext &sig, const void *,
                          BufferedStackTrace *stack) {
  const auto start_pc = StackTrace::GetNextInstructionPc(sig.pc);
  const auto request_fast = common_flags()->fast_unwind_on_fatal;
  stack->Unwind(start_pc, sig.bp, sig.context, request_fast);
}
// Handler for deadly signals. A SIGTRAP is first offered to HwasanOnSIGTRAP
// (it is probably a tag mismatch); if that handler returns true nothing more
// is done. Every other case falls through to the generic HandleDeadlySignal
// path, which uses OnStackUnwind to collect the stack.
void HwasanOnDeadlySignal(int signo, void *info, void *context) {
  // Short-circuit keeps HwasanOnSIGTRAP from being called for other signals.
  const bool consumed_by_trap_handler =
      signo == SIGTRAP &&
      HwasanOnSIGTRAP(signo, (siginfo_t *)info, (ucontext_t *)context);
  if (consumed_by_trap_handler)
    return;

  HandleDeadlySignal(info, context, GetTid(), &OnStackUnwind, nullptr);
}
} // namespace __hwasan
#endif // SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD