llvm-project/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll

; Test alloca instrumentation.
;
; RUN: opt < %s -hwasan -hwasan-with-ifunc=1 -S | FileCheck %s --check-prefixes=CHECK,DYNAMIC-SHADOW,NO-UAR-TAGS
; RUN: opt < %s -hwasan -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=CHECK,ZERO-BASED-SHADOW,NO-UAR-TAGS
; RUN: opt < %s -hwasan -hwasan-with-ifunc=1 -hwasan-uar-retag-to-zero=0 -S | FileCheck %s --check-prefixes=CHECK,DYNAMIC-SHADOW,UAR-TAGS
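;
; The three RUN lines cover: a dynamic shadow base located via ifunc
; (DYNAMIC-SHADOW), a zero-based shadow mapping (ZERO-BASED-SHADOW), and
; retagging with a non-zero tag on function exit instead of zero
; (UAR-TAGS vs. NO-UAR-TAGS).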
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android10000"
declare void @use32(i32*)
define void @test_alloca() sanitize_hwaddress !dbg !15 {
; CHECK-LABEL: @test_alloca(
; CHECK: %[[FP:[^ ]*]] = call i8* @llvm.frameaddress.p0i8(i32 0)
; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %[[FP]] to i64
; CHECK: %[[B:[^ ]*]] = lshr i64 %[[A]], 20
; CHECK: %[[BASE_TAG:[^ ]*]] = xor i64 %[[A]], %[[B]]
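; The per-frame base tag is derived from the frame address: FP xor (FP >> 20).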
; CHECK: %[[X:[^ ]*]] = alloca { i32, [12 x i8] }, align 16
; CHECK: %[[X_BC:[^ ]*]] = bitcast { i32, [12 x i8] }* %[[X]] to i32*
; CHECK: %[[X_TAG:[^ ]*]] = xor i64 %[[BASE_TAG]], 0
; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X_BC]] to i64
; CHECK: %[[C:[^ ]*]] = shl i64 %[[X_TAG]], 56
; CHECK: %[[D:[^ ]*]] = or i64 %[[X1]], %[[C]]
; CHECK: %[[X_HWASAN:[^ ]*]] = inttoptr i64 %[[D]] to i32*
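; The variable's tag is shifted into the top byte of the address to form the
; tagged pointer that replaces direct uses of the alloca.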
; CHECK: %[[X_TAG2:[^ ]*]] = trunc i64 %[[X_TAG]] to i8
; CHECK: %[[E:[^ ]*]] = ptrtoint i32* %[[X_BC]] to i64
; CHECK: %[[F:[^ ]*]] = lshr i64 %[[E]], 4
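; Shadow index = untagged address >> 4, i.e. one shadow byte per 16-byte granule.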
; DYNAMIC-SHADOW: %[[X_SHADOW:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F]]
; ZERO-BASED-SHADOW: %[[X_SHADOW:[^ ]*]] = inttoptr i64 %[[F]] to i8*
; CHECK: %[[X_SHADOW_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_SHADOW]], i32 0
; CHECK: store i8 4, i8* %[[X_SHADOW_GEP]]
; CHECK: %[[X_I8:[^ ]*]] = bitcast i32* %[[X_BC]] to i8*
; CHECK: %[[X_I8_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_I8]], i32 15
; CHECK: store i8 %[[X_TAG2]], i8* %[[X_I8_GEP]]
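; The i32 is padded out to one 16-byte granule. Since only 4 bytes are live,
; the shadow byte holds the short-granule size (4) and the real tag is stored
; in the last byte of the granule (offset 15).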
; CHECK: call void @llvm.dbg.value(
; CHECK-SAME: metadata !DIArgList(i32* %[[X_BC]], i32* %[[X_BC]])
; CHECK-SAME: metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0,
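; The debug intrinsic keeps referring to the untagged alloca; the tag offset
; for each operand is recorded via DW_OP_LLVM_tag_offset.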
; CHECK: call void @use32(i32* nonnull %[[X_HWASAN]])
; UAR-TAGS: %[[BASE_TAG_COMPL:[^ ]*]] = xor i64 %[[BASE_TAG]], 255
; UAR-TAGS: %[[X_TAG_UAR:[^ ]*]] = trunc i64 %[[BASE_TAG_COMPL]] to i8
; CHECK: %[[E2:[^ ]*]] = ptrtoint i32* %[[X_BC]] to i64
; CHECK: %[[F2:[^ ]*]] = lshr i64 %[[E2]], 4
; DYNAMIC-SHADOW: %[[X_SHADOW2:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F2]]
; ZERO-BASED-SHADOW: %[[X_SHADOW2:[^ ]*]] = inttoptr i64 %[[F2]] to i8*
; NO-UAR-TAGS: call void @llvm.memset.p0i8.i64(i8* align 1 %[[X_SHADOW2]], i8 0, i64 1, i1 false)
; UAR-TAGS: call void @llvm.memset.p0i8.i64(i8* align 1 %[[X_SHADOW2]], i8 %[[X_TAG_UAR]], i64 1, i1 false)
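; On function exit the granule's shadow is retagged: to zero by default
; (NO-UAR-TAGS), or to the complement of the base tag when
; -hwasan-uar-retag-to-zero=0 is passed (UAR-TAGS).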
; CHECK: ret void
entry:
%x = alloca i32, align 4
call void @llvm.dbg.value(metadata !DIArgList(i32* %x, i32* %x), metadata !22, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_deref)), !dbg !21
call void @use32(i32* nonnull %x), !dbg !23
ret void, !dbg !24
}
declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!14}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "alloca.cpp", directory: "/")
!2 = !{}
!3 = !{i32 7, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!14 = !{!"clang version 13.0.0"}
!15 = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !1, file: !1, line: 4, type: !16, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!16 = !DISubroutineType(types: !17)
!17 = !{null}
!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64)
!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!21 = !DILocation(line: 0, scope: !15)
!22 = !DILocalVariable(name: "x", scope: !15, file: !1, line: 5, type: !20)
!23 = !DILocation(line: 7, column: 5, scope: !15)
!24 = !DILocation(line: 8, column: 1, scope: !15)