//===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a printer that converts from our internal representation
// of machine-dependent LLVM code to the AArch64 assembly language.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
|
|
|
|
#include "AArch64MCInstLower.h"
|
2014-07-25 19:42:14 +08:00
|
|
|
#include "AArch64MachineFunctionInfo.h"
|
2014-05-24 20:50:23 +08:00
|
|
|
#include "AArch64RegisterInfo.h"
|
|
|
|
#include "AArch64Subtarget.h"
|
2017-08-31 16:28:48 +08:00
|
|
|
#include "AArch64TargetObjectFile.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "MCTargetDesc/AArch64AddressingModes.h"
|
2019-05-11 07:50:01 +08:00
|
|
|
#include "MCTargetDesc/AArch64InstPrinter.h"
|
[HWASan] Save + print registers when tag mismatch occurs in AArch64.
Summary:
This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance.
In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable.
The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format:
Registers where the failure occurred (pc 0x0055555561b4):
x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025
x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001
x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000
x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000
x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078
x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000
x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000
x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4
... and prints after the dump of memory tags around the buggy address.
Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging.
Reviewers: pcc, eugenis
Reviewed By: pcc, eugenis
Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers
Tags: #sanitizers, #llvm
Differential Revision: https://reviews.llvm.org/D58857
llvm-svn: 355738
2019-03-09 05:22:35 +08:00
|
|
|
#include "MCTargetDesc/AArch64MCExpr.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "MCTargetDesc/AArch64MCTargetDesc.h"
|
2018-10-27 14:13:06 +08:00
|
|
|
#include "MCTargetDesc/AArch64TargetStreamer.h"
|
2019-05-15 05:33:53 +08:00
|
|
|
#include "TargetInfo/AArch64TargetInfo.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "Utils/AArch64BaseInfo.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/ADT/SmallString.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/ADT/Triple.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/ADT/Twine.h"
|
2018-12-12 02:36:14 +08:00
|
|
|
#include "llvm/BinaryFormat/COFF.h"
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
#include "llvm/BinaryFormat/ELF.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/CodeGen/AsmPrinter.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2018-10-25 04:19:09 +08:00
|
|
|
#include "llvm/CodeGen/MachineJumpTableInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
2014-07-25 19:42:14 +08:00
|
|
|
#include "llvm/CodeGen/StackMaps.h"
|
2017-11-17 09:07:10 +08:00
|
|
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/IR/DataLayout.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/IR/DebugInfoMetadata.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/MC/MCAsmInfo.h"
|
|
|
|
#include "llvm/MC/MCContext.h"
|
|
|
|
#include "llvm/MC/MCInst.h"
|
|
|
|
#include "llvm/MC/MCInstBuilder.h"
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
#include "llvm/MC/MCSectionELF.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/MC/MCStreamer.h"
|
2015-03-06 04:04:21 +08:00
|
|
|
#include "llvm/MC/MCSymbol.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
2015-03-24 03:32:43 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2017-07-26 07:51:02 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
using namespace llvm;

#define DEBUG_TYPE "asm-printer"

namespace {

class AArch64AsmPrinter : public AsmPrinter {
  AArch64MCInstLower MCInstLowering;
  StackMaps SM;
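  // Per-function subtarget, refreshed from the MachineFunction in
  // runOnMachineFunction() below.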
  const AArch64Subtarget *STI;

public:
  AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
      : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
        SM(*this) {}

  StringRef getPassName() const override { return "AArch64 Assembly Printer"; }

  /// Wrapper for MCInstLowering.lowerOperand() for the
  /// tblgen'erated pseudo lowering.
  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
    return MCInstLowering.lowerOperand(MO, MCOp);
  }

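  // Jump-table lowering: emit the jump table itself and expand the
  // small-entry jump-table destination pseudo (see LowerJumpTableDestSmall).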
  void EmitJumpTableInfo() override;
  void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
                          const MachineBasicBlock *MBB, unsigned JTI);

  void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI);

  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                     const MachineInstr &MI);
  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
                       const MachineInstr &MI);

  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
  void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);

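  // Key for the lazily created outlined hwasan check routines:
  // (address register, short-granules flag, access-info immediate).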
  typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
  std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
  void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
  void EmitHwasanMemaccessSymbols(Module &M);

  void EmitSled(const MachineInstr &MI, SledKind Kind);

  /// tblgen'erated driver function for lowering simple MI->MC
  /// pseudo instructions.
  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
                                   const MachineInstr *MI);

  void EmitInstruction(const MachineInstr *MI) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AsmPrinter::getAnalysisUsage(AU);
    AU.setPreservesAll();
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    AArch64FI = MF.getInfo<AArch64FunctionInfo>();
    STI = static_cast<const AArch64Subtarget*>(&MF.getSubtarget());

    SetupMachineFunction(MF);

    if (STI->isTargetCOFF()) {
      bool Internal = MF.getFunction().hasInternalLinkage();
      COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
                                              : COFF::IMAGE_SYM_CLASS_EXTERNAL;
      int Type =
        COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;

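      // COFF function symbols carry an explicit .def/.scl/.type/.endef
      // record describing their storage class and complex type.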
      OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
      OutStreamer->EmitCOFFSymbolStorageClass(Scl);
      OutStreamer->EmitCOFFSymbolType(Type);
      OutStreamer->EndCOFFSymbolDef();
    }

    // Emit the rest of the function body.
    EmitFunctionBody();

    // Emit the XRay table for this function.
    emitXRayTable();

    // We didn't modify anything.
    return false;
  }

private:
  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
  bool printAsmRegInClass(const MachineOperand &MO,
                          const TargetRegisterClass *RC, unsigned AltName,
                          raw_ostream &O);

  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                       const char *ExtraCode, raw_ostream &O) override;
  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
                             const char *ExtraCode, raw_ostream &O) override;

  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);

  void EmitFunctionBodyEnd() override;

  MCSymbol *GetCPISymbol(unsigned CPID) const override;
  void EmitEndOfAsmFile(Module &M) override;

  AArch64FunctionInfo *AArch64FI = nullptr;

  /// Emit the LOHs contained in AArch64FI.
  void EmitLOHs();

  /// Emit an instruction to set a floating-point register to zero.
  void EmitFMov0(const MachineInstr &MI);

  using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;

  MInstToMCSymbol LOHInstToLabel;
};

} // end anonymous namespace

void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
{
  EmitSled(MI, SledKind::FUNCTION_ENTER);
}

void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
{
  EmitSled(MI, SledKind::FUNCTION_EXIT);
}

void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
{
  EmitSled(MI, SledKind::TAIL_CALL);
}

void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
{
  static const int8_t NoopsInSledCount = 7;
  // We want to emit the following pattern:
  //
  // .Lxray_sled_N:
  //   ALIGN
  //   B #32
  //   ; 7 NOP instructions (28 bytes)
  // .tmpN
  //
  // We need the 28 bytes (7 instructions) because at runtime, we'd be patching
  // over the full 32 bytes (8 instructions) with the following pattern:
  //
  //   STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack
  //   LDR W0, #12 ; W0 := function ID
  //   LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit
  //   BLR X16 ; call the tracing trampoline
  //   ;DATA: 32 bits of function ID
  //   ;DATA: lower 32 bits of the address of the trampoline
  //   ;DATA: higher 32 bits of the address of the trampoline
  //   LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
  //
  OutStreamer->EmitCodeAlignment(4);
  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
  OutStreamer->EmitLabel(CurSled);
  auto Target = OutContext.createTempSymbol();

  // Emit "B #32" instruction, which jumps over the next 28 bytes.
  // The operand has to be the number of 4-byte instructions to jump over,
  // including the current instruction.
  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8));

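  // Pad the sled with NOPs; HINT #0 is the canonical AArch64 NOP encoding.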
  for (int8_t I = 0; I < NoopsInSledCount; I++)
    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));

  OutStreamer->EmitLabel(Target);
  recordSled(CurSled, MI, Kind);
}

void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
  Register Reg = MI.getOperand(0).getReg();
  bool IsShort =
      MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES;
  uint32_t AccessInfo = MI.getOperand(1).getImm();
  MCSymbol *&Sym =
      HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)];
  if (!Sym) {
    // FIXME: Make this work on non-ELF.
    if (!TM.getTargetTriple().isOSBinFormatELF())
      report_fatal_error("llvm.hwasan.check.memaccess only supported on ELF");

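    // The outlined routine's name encodes the address register and the
    // access-info immediate, e.g. __hwasan_check_x1_2, with a "_short"
    // suffix for short-granule checks.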
std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
|
|
|
|
utostr(AccessInfo);
|
2019-09-27 09:02:10 +08:00
|
|
|
if (IsShort)
|
|
|
|
SymName += "_short";
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
Sym = OutContext.getOrCreateSymbol(SymName);
|
|
|
|
}
|
|
|
|
|
|
|
|
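  // At the check site itself we only emit a single BL; the actual tag
  // comparison lives in the outlined routine created by
  // EmitHwasanMemaccessSymbols().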
  EmitToStreamer(*OutStreamer,
                 MCInstBuilder(AArch64::BL)
                     .addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
}

void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
  if (HwasanMemaccessSymbols.empty())
    return;

  const Triple &TT = TM.getTargetTriple();
  assert(TT.isOSBinFormatELF());
  std::unique_ptr<MCSubtargetInfo> STI(
      TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));

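  // Failing checks report through the hwasan runtime: classic checks call
  // __hwasan_tag_mismatch, short-granule checks call __hwasan_tag_mismatch_v2.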
MCSymbol *HwasanTagMismatchV1Sym =
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
|
2019-09-27 09:02:10 +08:00
|
|
|
MCSymbol *HwasanTagMismatchV2Sym =
|
|
|
|
OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2");
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
|
2019-09-27 09:02:10 +08:00
|
|
|
const MCSymbolRefExpr *HwasanTagMismatchV1Ref =
|
|
|
|
MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext);
|
|
|
|
const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
|
|
|
|
MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
|
[HWASan] Save + print registers when tag mismatch occurs in AArch64.
Summary:
This change change the instrumentation to allow users to view the registers at the point at which tag mismatch occured. Most of the heavy lifting is done in the runtime library, where we save the registers to the stack and emit unwind information. This allows us to reduce the overhead, as very little additional work needs to be done in each __hwasan_check instance.
In this implementation, the fast path of __hwasan_check is unmodified. There are an additional 4 instructions (16B) emitted in the slow path in every __hwasan_check instance. This may increase binary size somewhat, but as most of the work is done in the runtime library, it's manageable.
The failure trace now contains a list of registers at the point of which the failure occured, in a format similar to that of Android's tombstones. It currently has the following format:
Registers where the failure occurred (pc 0x0055555561b4):
x0 0000000000000014 x1 0000007ffffff6c0 x2 1100007ffffff6d0 x3 12000056ffffe025
x4 0000007fff800000 x5 0000000000000014 x6 0000007fff800000 x7 0000000000000001
x8 12000056ffffe020 x9 0200007700000000 x10 0200007700000000 x11 0000000000000000
x12 0000007fffffdde0 x13 0000000000000000 x14 02b65b01f7a97490 x15 0000000000000000
x16 0000007fb77376b8 x17 0000000000000012 x18 0000007fb7ed6000 x19 0000005555556078
x20 0000007ffffff768 x21 0000007ffffff778 x22 0000000000000001 x23 0000000000000000
x24 0000000000000000 x25 0000000000000000 x26 0000000000000000 x27 0000000000000000
x28 0000000000000000 x29 0000007ffffff6f0 x30 00000055555561b4
... and prints after the dump of memory tags around the buggy address.
Every register is saved exactly as it was at the point where the tag mismatch occurs, with the exception of x16/x17. These registers are used in the tag mismatch calculation as scratch registers during __hwasan_check, and cannot be saved without affecting the fast path. As these registers are designated as scratch registers for linking, there should be no important information in them that could aid in debugging.
Reviewers: pcc, eugenis
Reviewed By: pcc, eugenis
Subscribers: srhines, kubamracek, mgorny, javed.absar, krytarowski, kristof.beyls, hiraditya, jdoerfert, llvm-commits, #sanitizers
Tags: #sanitizers, #llvm
Differential Revision: https://reviews.llvm.org/D58857
llvm-svn: 355738
2019-03-09 05:22:35 +08:00
|
|
|
|
hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
2019-01-23 10:20:10 +08:00
|
|
|
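// Emit one outlined check function per (address register, short-granule flag,
// access-info) combination recorded in HwasanMemaccessSymbols during
// instruction selection. Each function is placed in its own SHF_GROUP section
// in .text.hot and marked weak + hidden, so identical copies emitted by other
// translation units can be folded by the linker.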
for (auto &P : HwasanMemaccessSymbols) {
|
2019-09-27 09:02:10 +08:00
|
|
|
unsigned Reg = std::get<0>(P.first);
|
|
|
|
bool IsShort = std::get<1>(P.first);
|
|
|
|
uint32_t AccessInfo = std::get<2>(P.first);
|
|
|
|
const MCSymbolRefExpr *HwasanTagMismatchRef =
|
|
|
|
IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
|
2019-01-23 10:20:10 +08:00
|
|
|
MCSymbol *Sym = P.second;
|
|
|
|
|
|
|
|
OutStreamer->SwitchSection(OutContext.getELFSection(
|
|
|
|
".text.hot", ELF::SHT_PROGBITS,
|
|
|
|
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
|
|
|
|
Sym->getName()));
|
|
|
|
|
|
|
|
OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
|
|
|
|
OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak);
|
|
|
|
OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden);
|
|
|
|
OutStreamer->EmitLabel(Sym);
|
|
|
|
|
|
|
|
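// ubfx x16, x<Reg>, #4, #52: strip the tag byte and the low four bits of the
// pointer, leaving the 16-byte granule index, which is also the byte offset
// of this granule's tag in the shadow.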
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(4)
|
|
|
|
.addImm(55),
|
|
|
|
*STI);
|
|
|
|
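// ldrb w16, [x9, x16]: load the memory tag for the granule. The check
// pseudo-instruction is expected to provide the shadow base in x9 at every
// call site.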
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX)
|
|
|
|
.addReg(AArch64::W16)
|
|
|
|
.addReg(AArch64::X9)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addImm(0)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
2019-06-20 04:40:03 +08:00
|
|
|
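// cmp x16, x<Reg>, lsr #56: compare the loaded memory tag against the
// pointer's tag held in its top byte.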
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::SUBSXrs)
|
|
|
|
.addReg(AArch64::XZR)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
|
|
|
|
*STI);
|
2019-09-27 09:02:10 +08:00
|
|
|
MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
|
2019-01-23 10:20:10 +08:00
|
|
|
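// b.ne: the tags differ, so take the slow path; this may still turn out to be
// a valid access to a short granule.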
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::Bcc)
|
|
|
|
.addImm(AArch64CC::NE)
|
2019-09-27 09:02:10 +08:00
|
|
|
.addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
|
|
|
|
OutContext)),
|
2019-01-23 10:20:10 +08:00
|
|
|
*STI);
|
2019-07-10 04:22:36 +08:00
|
|
|
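// Fast path: the tags match, return straight to the instrumented code.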
MCSymbol *ReturnSym = OutContext.createTempSymbol();
|
|
|
|
OutStreamer->EmitLabel(ReturnSym);
|
2019-01-23 10:20:10 +08:00
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
|
2019-09-27 09:02:10 +08:00
|
|
|
OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
|
2019-01-23 10:20:10 +08:00
|
|
|
|
2019-09-27 09:02:10 +08:00
|
|
|
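// Short-granule handling: a memory tag in the range 1..15 means only that
// many leading bytes of the 16-byte granule are addressable, and the real tag
// is stored in the granule's last byte. A memory tag above 15 cannot be a
// short-granule size, so the first comparison (cmp w16, #15; b.hi) treats it
// as a genuine mismatch.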
if (IsShort) {
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
|
|
|
|
.addReg(AArch64::WZR)
|
|
|
|
.addReg(AArch64::W16)
|
|
|
|
.addImm(15)
|
2019-07-10 04:22:36 +08:00
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
2019-09-27 09:02:10 +08:00
|
|
|
MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
|
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::Bcc)
|
|
|
|
.addImm(AArch64CC::HI)
|
|
|
|
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
|
|
|
|
*STI);
|
|
|
|
|
|
|
|
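// and x17, x<Reg>, #0xf: offset of the first accessed byte within its
// granule.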
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::ANDXri)
|
|
|
|
.addReg(AArch64::X17)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
|
|
|
|
*STI);
|
|
|
|
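// The low four bits of AccessInfo encode log2(access size); advance x17 so it
// indexes the last byte the access touches.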
unsigned Size = 1 << (AccessInfo & 0xf);
|
|
|
|
if (Size != 1)
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
|
|
|
|
.addReg(AArch64::X17)
|
|
|
|
.addReg(AArch64::X17)
|
|
|
|
.addImm(Size - 1)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
|
|
|
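// cmp w16, w17; b.ls: if the short-granule size is not greater than the index
// of the last byte accessed, the access runs past the valid bytes: mismatch.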
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
|
|
|
|
.addReg(AArch64::WZR)
|
|
|
|
.addReg(AArch64::W16)
|
|
|
|
.addReg(AArch64::W17)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::Bcc)
|
|
|
|
.addImm(AArch64CC::LS)
|
|
|
|
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
|
|
|
|
*STI);
|
|
|
|
|
|
|
|
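// orr x16, x<Reg>, #0xf: address of the granule's last byte, which holds the
// real tag for a short granule. Load it and compare it with the pointer tag;
// if they match, the access is in bounds and we return.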
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::ORRXri)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
|
|
|
|
.addReg(AArch64::W16)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::SUBSXrs)
|
|
|
|
.addReg(AArch64::XZR)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::Bcc)
|
|
|
|
.addImm(AArch64CC::EQ)
|
|
|
|
.addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
|
|
|
|
*STI);
|
|
|
|
|
|
|
|
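// Definite tag mismatch: fall through to the reporting tail.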
OutStreamer->EmitLabel(HandleMismatchSym);
|
|
|
|
}
|
2019-07-10 04:22:36 +08:00
|
|
|
|
2019-03-09 05:22:35 +08:00
|
|
|
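// stp x0, x1, [sp, #-256]! (STP imm7 operands are scaled by 8, so -32 means
// -256 bytes): reserve the frame in which the __hwasan_tag_mismatch runtime
// is expected to record the register state, spilling x0/x1 before they are
// clobbered by the argument setup below. The following stp x29, x30,
// [sp, #232] then saves the frame pointer and link register near the top of
// that frame.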
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
|
|
|
|
.addReg(AArch64::SP)
|
|
|
|
.addReg(AArch64::X0)
|
|
|
|
.addReg(AArch64::X1)
|
|
|
|
.addReg(AArch64::SP)
|
|
|
|
.addImm(-32),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi)
|
|
|
|
.addReg(AArch64::FP)
|
|
|
|
.addReg(AArch64::LR)
|
|
|
|
.addReg(AArch64::SP)
|
|
|
|
.addImm(29),
|
|
|
|
*STI);
|
|
|
|
|
2019-01-23 10:20:10 +08:00
|
|
|
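// Argument setup for the tag-mismatch runtime: the faulting tagged address in
// x0 (mov x0, x<Reg> unless it is already there) and the access-info encoding
// in x1.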
if (Reg != AArch64::X0)
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs)
|
|
|
|
.addReg(AArch64::X0)
|
|
|
|
.addReg(AArch64::XZR)
|
|
|
|
.addReg(Reg)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi)
|
|
|
|
.addReg(AArch64::X1)
|
|
|
|
.addImm(AccessInfo)
|
|
|
|
.addImm(0),
|
|
|
|
*STI);
|
2019-03-09 05:22:35 +08:00
|
|
|
|
|
|
|
// Intentionally load the GOT entry and branch to it, rather than possibly
|
|
|
|
// late binding the function, which may clobber the registers before we have
|
|
|
|
// a chance to save them.
|
2019-01-23 10:20:10 +08:00
|
|
|
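// adrp x16, :got:__hwasan_tag_mismatch(_v2); ldr x16, [x16, :got_lo12:...];
// br x16: a tail branch through the GOT entry. Using br rather than blr
// leaves lr pointing at the instrumented call site, which the runtime can use
// when reporting.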
OutStreamer->EmitInstruction(
|
2019-03-09 05:22:35 +08:00
|
|
|
MCInstBuilder(AArch64::ADRP)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addExpr(AArch64MCExpr::create(
|
2019-09-27 09:02:10 +08:00
|
|
|
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
|
|
|
|
OutContext)),
|
2019-01-23 10:20:10 +08:00
|
|
|
*STI);
|
2019-03-09 05:22:35 +08:00
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::LDRXui)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addReg(AArch64::X16)
|
|
|
|
.addExpr(AArch64MCExpr::create(
|
2019-09-27 09:02:10 +08:00
|
|
|
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
|
|
|
|
OutContext)),
|
2019-03-09 05:22:35 +08:00
|
|
|
*STI);
|
|
|
|
OutStreamer->EmitInstruction(
|
|
|
|
MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
|
2019-01-23 10:20:10 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
|
2019-01-23 10:20:10 +08:00
|
|
|
EmitHwasanMemaccessSymbols(M);
|
|
|
|
|
2015-06-16 23:44:21 +08:00
|
|
|
const Triple &TT = TM.getTargetTriple();
|
2015-02-03 14:40:19 +08:00
|
|
|
if (TT.isOSBinFormatMachO()) {
|
2014-04-18 22:54:41 +08:00
|
|
|
// Funny Darwin hack: This flag tells the linker that no global symbols
|
|
|
|
// contain code that falls through to other global symbols (e.g. the obvious
|
|
|
|
// implementation of multiple entry points). If this doesn't occur, the
|
|
|
|
// linker can safely perform dead code stripping. Since LLVM never
|
|
|
|
// generates code that does this, it is always safe to set.
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
|
2018-11-27 02:43:48 +08:00
|
|
|
emitStackMaps(SM);
|
2014-04-18 22:54:41 +08:00
|
|
|
}
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
void AArch64AsmPrinter::EmitLOHs() {
|
2014-03-29 18:18:08 +08:00
|
|
|
SmallVector<MCSymbol *, 3> MCArgs;
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
for (const auto &D : AArch64FI->getLOHContainer()) {
|
2014-03-30 03:21:20 +08:00
|
|
|
for (const MachineInstr *MI : D.getArgs()) {
|
|
|
|
MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
|
2014-03-29 18:18:08 +08:00
|
|
|
assert(LabelIt != LOHInstToLabel.end() &&
|
|
|
|
"Label hasn't been inserted for LOH related instruction");
|
|
|
|
MCArgs.push_back(LabelIt->second);
|
|
|
|
}
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLOHDirective(D.getKind(), MCArgs);
|
2014-03-29 18:18:08 +08:00
|
|
|
MCArgs.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
void AArch64AsmPrinter::EmitFunctionBodyEnd() {
|
|
|
|
if (!AArch64FI->getLOHRelated().empty())
|
2014-03-29 18:18:08 +08:00
|
|
|
EmitLOHs();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// GetCPISymbol - Return the symbol for the specified constant pool entry.
|
2014-05-24 20:50:23 +08:00
|
|
|
MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
|
2014-03-29 18:18:08 +08:00
|
|
|
// Darwin uses a linker-private symbol name for constant-pools (to
|
|
|
|
// avoid addends on the relocation?), ELF has no such concept and
|
|
|
|
// uses a normal private symbol.
|
2016-10-01 13:57:55 +08:00
|
|
|
if (!getDataLayout().getLinkerPrivateGlobalPrefix().empty())
|
2015-05-19 02:43:14 +08:00
|
|
|
return OutContext.getOrCreateSymbol(
|
2014-03-29 18:18:08 +08:00
|
|
|
Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
|
|
|
|
Twine(getFunctionNumber()) + "_" + Twine(CPID));
|
|
|
|
|
2018-07-26 02:35:31 +08:00
|
|
|
return AsmPrinter::GetCPISymbol(CPID);
|
2014-03-29 18:18:08 +08:00
|
|
|
}

void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
                                     raw_ostream &O) {
  const MachineOperand &MO = MI->getOperand(OpNum);
  switch (MO.getType()) {
  default:
    llvm_unreachable("<unknown operand type>");
  case MachineOperand::MO_Register: {
    Register Reg = MO.getReg();
    assert(Register::isPhysicalRegister(Reg));
    assert(!MO.getSubReg() && "Subregs should be eliminated!");
    O << AArch64InstPrinter::getRegisterName(Reg);
    break;
  }
  case MachineOperand::MO_Immediate: {
    O << MO.getImm();
    break;
  }
  case MachineOperand::MO_GlobalAddress: {
    PrintSymbolOperand(MO, O);
    break;
  }
  case MachineOperand::MO_BlockAddress: {
    MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
    Sym->print(O, MAI);
    break;
  }
  }
}

bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
                                          raw_ostream &O) {
  Register Reg = MO.getReg();
  switch (Mode) {
  default:
    return true; // Unknown mode.
  case 'w':
    Reg = getWRegFromXReg(Reg);
    break;
  case 'x':
    Reg = getXRegFromWReg(Reg);
    break;
  }

  O << AArch64InstPrinter::getRegisterName(Reg);
  return false;
}
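
// Illustrative usage (assumed user code, not from this file): the 'w'/'x'
// handling above is what makes GNU-style operand modifiers work in inline
// assembly, e.g.
//   long v;
//   asm("add %x0, %x1, #1" : "=r"(v) : "r"(v));  // prints the 64-bit names
//   asm("mov %w0, wzr" : "=r"(v));                // prints the 32-bit name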

// Prints the register in MO using class RC using the offset in the
// new register class. This should not be used for cross class
// printing.
bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
                                           const TargetRegisterClass *RC,
                                           unsigned AltName, raw_ostream &O) {
  assert(MO.isReg() && "Should only get here with a register!");
  const TargetRegisterInfo *RI = STI->getRegisterInfo();
  Register Reg = MO.getReg();
  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
  assert(RI->regsOverlap(RegToPrint, Reg));
  O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
  return false;
}

bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                                        const char *ExtraCode, raw_ostream &O) {
  const MachineOperand &MO = MI->getOperand(OpNum);

  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O))
    return false;

  // Does this asm operand have a single letter operand modifier?
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    default:
      return true; // Unknown modifier.
    case 'w': // Print W register
    case 'x': // Print X register
      if (MO.isReg())
        return printAsmMRegister(MO, ExtraCode[0], O);
      if (MO.isImm() && MO.getImm() == 0) {
        unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
        O << AArch64InstPrinter::getRegisterName(Reg);
        return false;
      }
      printOperand(MI, OpNum, O);
      return false;
    case 'b': // Print B register.
    case 'h': // Print H register.
    case 's': // Print S register.
    case 'd': // Print D register.
    case 'q': // Print Q register.
    case 'z': // Print Z register.
      if (MO.isReg()) {
        const TargetRegisterClass *RC;
        switch (ExtraCode[0]) {
        case 'b':
          RC = &AArch64::FPR8RegClass;
          break;
        case 'h':
          RC = &AArch64::FPR16RegClass;
          break;
        case 's':
          RC = &AArch64::FPR32RegClass;
          break;
        case 'd':
          RC = &AArch64::FPR64RegClass;
          break;
        case 'q':
          RC = &AArch64::FPR128RegClass;
          break;
        case 'z':
          RC = &AArch64::ZPRRegClass;
          break;
        default:
          return true;
        }
        return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O);
      }
      printOperand(MI, OpNum, O);
      return false;
    }
  }

  // According to ARM, we should emit x and v registers unless we have a
  // modifier.
  if (MO.isReg()) {
    Register Reg = MO.getReg();

    // If this is a w or x register, print an x register.
    if (AArch64::GPR32allRegClass.contains(Reg) ||
        AArch64::GPR64allRegClass.contains(Reg))
      return printAsmMRegister(MO, 'x', O);

    unsigned AltName = AArch64::NoRegAltName;
    const TargetRegisterClass *RegClass;
    if (AArch64::ZPRRegClass.contains(Reg)) {
      RegClass = &AArch64::ZPRRegClass;
    } else if (AArch64::PPRRegClass.contains(Reg)) {
      RegClass = &AArch64::PPRRegClass;
    } else {
      RegClass = &AArch64::FPR128RegClass;
      AltName = AArch64::vreg;
    }

    // If this is a b, h, s, d, or q register, print it as a v register.
    return printAsmRegInClass(MO, RegClass, AltName, O);
  }

  printOperand(MI, OpNum, O);
  return false;
}
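
// Illustrative summary (not from the original source): for a NEON value bound
// to operand 0, an unmodified "%0" prints the full vector name (e.g. "v0")
// via the vreg alt name, while "%d0", "%s0", etc. select the sized view
// ("d0", "s0"); SVE data and predicate registers default to "z0"/"p0".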

bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                              unsigned OpNum,
                                              const char *ExtraCode,
                                              raw_ostream &O) {
  if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
    return true; // Unknown modifier.

  const MachineOperand &MO = MI->getOperand(OpNum);
  assert(MO.isReg() && "unexpected inline asm memory operand");
  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
  return false;
}
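
// Illustrative example (constraint letter assumed): an inline-asm memory
// operand such as asm("ldr %0, %1" : "=r"(v) : "Q"(*p)) arrives here with the
// address already in a register, so it is printed simply as "[x0]".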

void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
                                               raw_ostream &OS) {
  unsigned NOps = MI->getNumOperands();
  assert(NOps == 4);
  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
  // cast away const; DIetc do not take const operands for some reason.
  OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata())
            ->getName();
  OS << " <- ";
  // Frame address. Currently handles register +- offset only.
  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
  OS << '[';
  printOperand(MI, 0, OS);
  OS << '+';
  printOperand(MI, 1, OS);
  OS << ']';
  OS << "+";
  printOperand(MI, NOps - 2, OS);
}

void AArch64AsmPrinter::EmitJumpTableInfo() {
  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
  if (!MJTI) return;

  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
  if (JT.empty()) return;

  const Function &F = MF->getFunction();
  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
  bool JTInDiffSection =
      !STI->isTargetCOFF() ||
      !TLOF.shouldPutJumpTableInFunctionSection(
          MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
          F);
  if (JTInDiffSection) {
    // Drop it in the readonly section.
    MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM);
    OutStreamer->SwitchSection(ReadOnlySec);
  }

  auto AFI = MF->getInfo<AArch64FunctionInfo>();
  for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;

    // If this jump table was deleted, ignore it.
    if (JTBBs.empty()) continue;

    unsigned Size = AFI->getJumpTableEntrySize(JTI);
    EmitAlignment(llvm::Align(Size));
    OutStreamer->EmitLabel(GetJTISymbol(JTI));

    for (auto *JTBB : JTBBs)
      emitJumpTableEntry(MJTI, JTBB, JTI);
  }
}

void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                           const MachineBasicBlock *MBB,
                                           unsigned JTI) {
  const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
  auto AFI = MF->getInfo<AArch64FunctionInfo>();
  unsigned Size = AFI->getJumpTableEntrySize(JTI);

  if (Size == 4) {
    // .word LBB - LJTI
    const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
    const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext);
    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
  } else {
    // .byte (LBB - LBB) >> 2 (or .hword)
    const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI);
    const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext);
    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
    Value = MCBinaryExpr::createLShr(
        Value, MCConstantExpr::create(2, OutContext), OutContext);
  }

  OutStreamer->EmitValue(Value, Size);
}
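
// Illustrative output (label names assumed): a 4-byte table prints entries of
// the form
//   .word  .LBB0_3-.LJTI0_0
// while compressed 1- or 2-byte tables print the instruction-count offset
// from the base block chosen by the compression pass, e.g.
//   .byte  (.LBB0_3-.LBB0_1)>>2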

/// Small jump tables contain an unsigned byte or half, representing the offset
/// from the lowest-addressed possible destination to the desired basic
/// block. Since all instructions are 4-byte aligned, this is further compressed
/// by counting in instructions rather than bytes (i.e. divided by 4). So, to
/// materialize the correct destination we need:
///
///   adr xDest, .LBB0_0
///   ldrb wScratch, [xTable, xEntry] (with "lsl #1" for ldrh).
///   add xDest, xDest, xScratch, lsl #2
void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
                                                const llvm::MachineInstr &MI) {
  Register DestReg = MI.getOperand(0).getReg();
  Register ScratchReg = MI.getOperand(1).getReg();
  Register ScratchRegW =
      STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
  Register TableReg = MI.getOperand(2).getReg();
  Register EntryReg = MI.getOperand(3).getReg();
  int JTIdx = MI.getOperand(4).getIndex();
  bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;

  // This has to be first because the compression pass based its reachability
  // calculations on the start of the JumpTableDest instruction.
  auto Label =
      MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx);
  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR)
                                  .addReg(DestReg)
                                  .addExpr(MCSymbolRefExpr::create(
                                      Label, MF->getContext())));

  // Load the number of instruction-steps to offset from the label.
  unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX;
  EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode)
                                  .addReg(ScratchRegW)
                                  .addReg(TableReg)
                                  .addReg(EntryReg)
                                  .addImm(0)
                                  .addImm(IsByteEntry ? 0 : 1));

  // Multiply the steps by 4 and add to the already materialized base label
  // address.
  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs)
                                  .addReg(DestReg)
                                  .addReg(DestReg)
                                  .addReg(ScratchReg)
                                  .addImm(2));
}
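
// Worked example (illustrative): if the loaded byte entry is 5, the sequence
// above computes xDest = .LBB0_0 + (5 << 2), i.e. the block that starts five
// 4-byte instructions past the table's base destination.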

void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                                      const MachineInstr &MI) {
  unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();

  SM.recordStackMap(MI);
  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");

  // Scan ahead to trim the shadow.
  const MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::const_iterator MII(MI);
  ++MII;
  while (NumNOPBytes > 0) {
    if (MII == MBB.end() || MII->isCall() ||
        MII->getOpcode() == AArch64::DBG_VALUE ||
        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
        MII->getOpcode() == TargetOpcode::STACKMAP)
      break;
    ++MII;
    NumNOPBytes -= 4;
  }

  // Emit nops.
  for (unsigned i = 0; i < NumNOPBytes; i += 4)
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
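
// Illustrative note (not from the original source): a stackmap requesting 16
// shadow bytes that is followed by two ordinary instructions before the next
// call only needs 16 - 2*4 = 8 bytes of padding, so two HINT #0 (nop)
// instructions are emitted.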

// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>
void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
                                        const MachineInstr &MI) {
  SM.recordPatchPoint(MI);

  PatchPointOpers Opers(&MI);

  int64_t CallTarget = Opers.getCallTarget().getImm();
  unsigned EncodedBytes = 0;
  if (CallTarget) {
    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
           "High 16 bits of call target should be zero.");
    Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
    EncodedBytes = 16;
    // Materialize the jump address:
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
                                    .addReg(ScratchReg)
                                    .addImm((CallTarget >> 32) & 0xFFFF)
                                    .addImm(32));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
                                    .addReg(ScratchReg)
                                    .addReg(ScratchReg)
                                    .addImm((CallTarget >> 16) & 0xFFFF)
                                    .addImm(16));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
                                    .addReg(ScratchReg)
                                    .addReg(ScratchReg)
                                    .addImm(CallTarget & 0xFFFF)
                                    .addImm(0));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
  }
  // Emit padding.
  unsigned NumBytes = Opers.getNumPatchBytes();
  assert(NumBytes >= EncodedBytes &&
         "Patchpoint can't request size less than the length of a call.");
  assert((NumBytes - EncodedBytes) % 4 == 0 &&
         "Invalid number of NOP bytes requested!");
  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
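
// Illustrative expansion (scratch register assumed): a patchpoint with a
// non-zero 48-bit target emits the fixed 16-byte call sequence
//   movz x16, #hi16, lsl #32
//   movk x16, #mid16, lsl #16
//   movk x16, #lo16
//   blr  x16
// followed by HINT #0 padding up to the requested <numBytes>.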

void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
  Register DestReg = MI.getOperand(0).getReg();
  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
    // Convert H/S/D register to corresponding Q register
    if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
      DestReg = AArch64::Q0 + (DestReg - AArch64::H0);
    else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
      DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
    else {
      assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
      DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
    }
    MCInst MOVI;
    MOVI.setOpcode(AArch64::MOVIv2d_ns);
    MOVI.addOperand(MCOperand::createReg(DestReg));
    MOVI.addOperand(MCOperand::createImm(0));
    EmitToStreamer(*OutStreamer, MOVI);
  } else {
    MCInst FMov;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("Unexpected opcode");
    case AArch64::FMOVH0:
      FMov.setOpcode(AArch64::FMOVWHr);
      FMov.addOperand(MCOperand::createReg(DestReg));
      FMov.addOperand(MCOperand::createReg(AArch64::WZR));
      break;
    case AArch64::FMOVS0:
      FMov.setOpcode(AArch64::FMOVWSr);
      FMov.addOperand(MCOperand::createReg(DestReg));
      FMov.addOperand(MCOperand::createReg(AArch64::WZR));
      break;
    case AArch64::FMOVD0:
      FMov.setOpcode(AArch64::FMOVXDr);
      FMov.addOperand(MCOperand::createReg(DestReg));
      FMov.addOperand(MCOperand::createReg(AArch64::XZR));
      break;
    }
    EmitToStreamer(*OutStreamer, FMov);
  }
}
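
// Illustrative lowering (destination d0 assumed): FMOVD0 becomes
//   movi v0.2d, #0
// on subtargets with zero-cycle FP zeroing (and no workaround), and
//   fmov d0, xzr
// otherwise; the S and H variants use "fmov s0, wzr" / "fmov h0, wzr".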

// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"

void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  // Do any auto-generated pseudo lowerings.
  if (emitPseudoExpansionLowering(*OutStreamer, MI))
    return;

  if (AArch64FI->getLOHRelated().count(MI)) {
    // Generate a label for LOH related instruction
    MCSymbol *LOHLabel = createTempSymbol("loh");
    // Associate the instruction with the label
    LOHInstToLabel[MI] = LOHLabel;
    OutStreamer->EmitLabel(LOHLabel);
  }

  AArch64TargetStreamer *TS =
      static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
  // Do any manual lowerings.
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::MOVMCSym: {
    Register DestReg = MI->getOperand(0).getReg();
    const MachineOperand &MO_Sym = MI->getOperand(1);
    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
    MCOperand Hi_MCSym, Lo_MCSym;

    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);

    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);

    MCInst MovZ;
    MovZ.setOpcode(AArch64::MOVZXi);
    MovZ.addOperand(MCOperand::createReg(DestReg));
    MovZ.addOperand(Hi_MCSym);
    MovZ.addOperand(MCOperand::createImm(16));
    EmitToStreamer(*OutStreamer, MovZ);

    MCInst MovK;
    MovK.setOpcode(AArch64::MOVKXi);
    MovK.addOperand(MCOperand::createReg(DestReg));
    MovK.addOperand(MCOperand::createReg(DestReg));
    MovK.addOperand(Lo_MCSym);
    MovK.addOperand(MCOperand::createImm(0));
    EmitToStreamer(*OutStreamer, MovK);
    return;
  }
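  // (Illustrative, destination register and specifier spelling assumed: the
  // MOVMCSym expansion above prints roughly as
  //   movz x8, #:abs_g1_s:sym, lsl #16
  //   movk x8, #:abs_g0_nc:sym
  // i.e. a signed high-half materialization plus a no-check low-half insert.)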
  case AArch64::MOVIv2d_ns:
    // If the target has <rdar://problem/16473581>, lower this
    // instruction to movi.16b instead.
    if (STI->hasZeroCycleZeroingFPWorkaround() &&
        MI->getOperand(1).getImm() == 0) {
      MCInst TmpInst;
      TmpInst.setOpcode(AArch64::MOVIv16b_ns);
      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
      EmitToStreamer(*OutStreamer, TmpInst);
      return;
    }
    break;

  case AArch64::DBG_VALUE: {
    if (isVerbose() && OutStreamer->hasRawTextSupport()) {
      SmallString<128> TmpStr;
      raw_svector_ostream OS(TmpStr);
      PrintDebugValueComment(MI, OS);
      OutStreamer->EmitRawText(StringRef(OS.str()));
    }
    return;

    case AArch64::EMITBKEY: {
      ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
      if (ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
          ExceptionHandlingType != ExceptionHandling::ARM)
        return;

      if (needsCFIMoves() == CFI_M_None)
        return;

      OutStreamer->EmitCFIBKeyFrame();
      return;
    }
  }

    // Tail calls use pseudo instructions so they have the proper code-gen
    // attributes (isCall, isReturn, etc.). We lower them to the real
    // instruction here.
  case AArch64::TCRETURNri:
  case AArch64::TCRETURNriBTI:
  case AArch64::TCRETURNriALL: {
    MCInst TmpInst;
    TmpInst.setOpcode(AArch64::BR);
    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
    EmitToStreamer(*OutStreamer, TmpInst);
    return;
  }
  case AArch64::TCRETURNdi: {
    MCOperand Dest;
    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
    MCInst TmpInst;
    TmpInst.setOpcode(AArch64::B);
    TmpInst.addOperand(Dest);
    EmitToStreamer(*OutStreamer, TmpInst);
    return;
  }
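  // (Illustrative note, not from the original source: the TCRETURN* cases
  // above print as a plain "br xN" for the register forms and "b callee" for
  // the direct form, since the tail-call pseudo carries only the target
  // operand.)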
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
case AArch64::TLSDESC_CALLSEQ: {
|
|
|
|
/// lower this to:
|
|
|
|
/// adrp x0, :tlsdesc:var
|
|
|
|
/// ldr x1, [x0, #:tlsdesc_lo12:var]
|
|
|
|
/// add x0, x0, #:tlsdesc_lo12:var
|
|
|
|
/// .tlsdesccall var
|
|
|
|
/// blr x1
|
|
|
|
/// (TPIDR_EL0 offset now in x0)
|
|
|
|
const MachineOperand &MO_Sym = MI->getOperand(0);
|
|
|
|
MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
|
|
|
|
MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
|
[AArch64] ILP32 Backend Relocation Support
Remove "_NC" suffix and semantics from TLSDESC_LD{64,32}_LO12 and
TLSDESC_ADD_LO12 relocations
Rearrange ordering in AArch64.def to follow relocation encoding
Fix name:
R_AARCH64_P32_LD64_GOT_LO12_NC => R_AARCH64_P32_LD32_GOT_LO12_NC
Add support for several "TLS", "TLSGD", and "TLSLD" relocations for
ILP32
Fix return values from isNonILP32reloc
Add implementations for
R_AARCH64_ADR_PREL_PG_HI21_NC, R_AARCH64_P32_LD32_GOT_LO12_NC,
R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC,
R_AARCH64_P32_TLSDESC_LD32_LO12, R_AARCH64_LD64_GOT_LO12_NC,
*TLSLD_LDST128_DTPREL_LO12, *TLSLD_LDST128_DTPREL_LO12_NC,
*TLSLE_LDST128_TPREL_LO12, *TLSLE_LDST128_TPREL_LO12_NC
Modify error messages to give name of equivalent relocation in the
ABI not being used, along with better checking for non-existent
requested relocations.
Added assembler support for "pg_hi21_nc"
Relocation definitions added without implementations:
R_AARCH64_P32_TLSDESC_ADR_PREL21, R_AARCH64_P32_TLSGD_ADR_PREL21,
R_AARCH64_P32_TLSGD_ADD_LO12_NC, R_AARCH64_P32_TLSLD_ADR_PREL21,
R_AARCH64_P32_TLSLD_ADR_PAGE21, R_AARCH64_P32_TLSLD_ADD_LO12_NC,
R_AARCH64_P32_TLSLD_LD_PREL19, R_AARCH64_P32_TLSDESC_LD_PREL19,
R_AARCH64_P32_TLSGD_ADR_PAGE21, R_AARCH64_P32_TLS_DTPREL,
R_AARCH64_P32_TLS_DTPMOD, R_AARCH64_P32_TLS_TPREL,
R_AARCH64_P32_TLSDESC
Fix encoding:
R_AARCH64_P32_TLSDESC_ADR_PAGE21
Reviewers: Peter Smith
Patch by: Joel Jones (jjones@cavium.com)
Differential Revision: https://reviews.llvm.org/D32072
llvm-svn: 301980
2017-05-03 06:01:48 +08:00
|
|
|
MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
|
|
|
|
MCInstLowering.lowerOperand(MO_Sym, Sym);
|
|
|
|
MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
|
|
|
|
MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
|
|
|
|
|
|
|
|
MCInst Adrp;
|
|
|
|
Adrp.setOpcode(AArch64::ADRP);
|
2015-05-14 02:37:00 +08:00
|
|
|
Adrp.addOperand(MCOperand::createReg(AArch64::X0));
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
Adrp.addOperand(SymTLSDesc);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, Adrp);
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
|
|
|
|
MCInst Ldr;
|
|
|
|
Ldr.setOpcode(AArch64::LDRXui);
|
2015-05-14 02:37:00 +08:00
|
|
|
Ldr.addOperand(MCOperand::createReg(AArch64::X1));
|
|
|
|
Ldr.addOperand(MCOperand::createReg(AArch64::X0));
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
Ldr.addOperand(SymTLSDescLo12);
|
2015-05-14 02:37:00 +08:00
|
|
|
Ldr.addOperand(MCOperand::createImm(0));
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, Ldr);
|
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the GNU linkers
ld.bfd and ld.gold currently only support a subset of the whole range of AArch64
ELF TLS relocations. Furthermore, they assume that some of the code sequences to
access thread-local variables are produced in a very specific sequence.
When the sequence is not as the linker expects, it can silently mis-relaxe/mis-optimize
the instructions.
Even if that wouldn't be the case, it's good to produce the exact sequence,
as that ensures that linkers can perform optimizing relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area size. Ideally clang
would grow an -mtls-size option to allow support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even modern ld.bfd and ld.gold
linkers do not support the associated relocations. An option (-aarch64-elf-ldtls-generation)
is added to enable generation of local dynamic code sequence, but is off by default.
* makes sure that the exact expected code sequence for local dynamic and general dynamic
accesses is produced, by making use of a new pseudo instruction. The patch also removes
two (AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing AArch64-specific pseudo
SDNode instructions that are superseded by the new one (TLSDESC_CALLSEQ).
llvm-svn: 231227
2015-03-04 17:12:08 +08:00
|
|
|
|
|
|
|
MCInst Add;
|
|
|
|
Add.setOpcode(AArch64::ADDXri);
|
2015-05-14 02:37:00 +08:00
|
|
|
Add.addOperand(MCOperand::createReg(AArch64::X0));
|
|
|
|
Add.addOperand(MCOperand::createReg(AArch64::X0));
    Add.addOperand(SymTLSDescLo12);
    Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
    EmitToStreamer(*OutStreamer, Add);

    // Emit a relocation-annotation. This expands to no code, but requests
    // that the following instruction gets an R_AARCH64_TLSDESC_CALL.
    MCInst TLSDescCall;
    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
    TLSDescCall.addOperand(Sym);
    EmitToStreamer(*OutStreamer, TLSDescCall);

    MCInst Blr;
    Blr.setOpcode(AArch64::BLR);
    Blr.addOperand(MCOperand::createReg(AArch64::X1));
    EmitToStreamer(*OutStreamer, Blr);

    return;
  }

  case AArch64::JumpTableDest32: {
    // We want:
    //     ldrsw xScratch, [xTable, xEntry, lsl #2]
    //     add xDest, xTable, xScratch
    unsigned DestReg = MI->getOperand(0).getReg(),
             ScratchReg = MI->getOperand(1).getReg(),
             TableReg = MI->getOperand(2).getReg(),
             EntryReg = MI->getOperand(3).getReg();
    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
                                     .addReg(ScratchReg)
                                     .addReg(TableReg)
                                     .addReg(EntryReg)
                                     .addImm(0)
                                     .addImm(1));
    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
                                     .addReg(DestReg)
                                     .addReg(TableReg)
                                     .addReg(ScratchReg)
                                     .addImm(0));
    return;
  }
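  // The 8- and 16-bit entry variants are handled out of line;
  // LowerJumpTableDestSmall presumably performs the analogous lowering with a
  // narrower (ldrb/ldrh) load of the table entry.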
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    LowerJumpTableDestSmall(*OutStreamer, *MI);
    return;

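  // EmitFMov0 materializes a floating-point zero; as far as can be inferred
  // here, it uses "movi vN.2d, #0" when the subtarget supports zero-cycle FP
  // zeroing and a plain "fmov hN/sN/dN, wzr/xzr" otherwise.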
  case AArch64::FMOVH0:
  case AArch64::FMOVS0:
  case AArch64::FMOVD0:
    EmitFMov0(*MI);
    return;

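  // STACKMAP and PATCHPOINT go through the generic StackMaps machinery (SM);
  // broadly speaking, the lowering records the locations in the stackmap
  // section and pads the site with the requested shadow/nop bytes.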
  case TargetOpcode::STACKMAP:
    return LowerSTACKMAP(*OutStreamer, SM, *MI);

  case TargetOpcode::PATCHPOINT:
    return LowerPATCHPOINT(*OutStreamer, SM, *MI);

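  // The PATCHABLE_* opcodes are XRay instrumentation points; their lowerings
  // emit the patchable nop sleds that the XRay runtime rewrites at run time.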
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    LowerPATCHABLE_FUNCTION_ENTER(*MI);
    return;

  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
    LowerPATCHABLE_FUNCTION_EXIT(*MI);
    return;

  case TargetOpcode::PATCHABLE_TAIL_CALL:
    LowerPATCHABLE_TAIL_CALL(*MI);
    return;

hwasan: Move memory access checks into small outlined functions on aarch64.
Each hwasan check requires emitting a small piece of code like this:
https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html#memory-accesses
The problem with this is that these code blocks typically bloat code
size significantly.
An obvious solution is to outline these blocks of code. In fact, this
has already been implemented under the -hwasan-instrument-with-calls
flag. However, as currently implemented this has a number of problems:
- The functions use the same calling convention as regular C functions.
This means that the backend must spill all temporary registers as
required by the platform's C calling convention, even though the
check only needs two registers on the hot path.
- The functions take the address to be checked in a fixed register,
which increases register pressure.
Both of these factors can diminish the code size effect and increase
the performance hit of -hwasan-instrument-with-calls.
The solution that this patch implements is to involve the aarch64
backend in outlining the checks. An intrinsic and pseudo-instruction
are created to represent a hwasan check. The pseudo-instruction
is register allocated like any other instruction, and we allow the
register allocator to select almost any register for the address to
check. A particular combination of (register selection, type of check)
triggers the creation in the backend of a function to handle the check
for specifically that pair. The resulting functions are deduplicated by
the linker. The pseudo-instruction (really the function) is specified
to preserve all registers except for the registers that the AAPCS
specifies may be clobbered by a call.
To measure the code size and performance effect of this change, I
took a number of measurements using Chromium for Android on aarch64,
comparing a browser with inlined checks (the baseline) against a
browser with outlined checks.
Code size: Size of .text decreases from 243897420 to 171619972 bytes,
or a 30% decrease.
Performance: Using Chromium's blink_perf.layout microbenchmarks I
measured a median performance regression of 6.24%.
The fact that a perf/size tradeoff is evident here suggests that
we might want to make the new behaviour conditional on -Os/-Oz.
But for now I've enabled it unconditionally, my reasoning being that
hwasan users typically expect a relatively large perf hit, and ~6%
isn't really adding much. We may want to revisit this decision in
the future, though.
I also tried experimenting with varying the number of registers
selectable by the hwasan check pseudo-instruction (which would result
in fewer variants being created), on the hypothesis that creating
fewer variants of the function would expose another perf/size tradeoff
by reducing icache pressure from the check functions at the cost of
register pressure. Although I did observe a code size increase with
fewer registers, I did not observe a strong correlation between the
number of registers and the performance of the resulting browser on the
microbenchmarks, so I conclude that we might as well use ~all registers
to get the maximum code size improvement. My results are below:
Regs | .text size | Perf hit
-----+------------+---------
~all | 171619972 | 6.24%
16 | 171765192 | 7.03%
8 | 172917788 | 5.82%
4 | 177054016 | 6.89%
Differential Revision: https://reviews.llvm.org/D56954
llvm-svn: 351920
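  // Sketch of the lowering described above (the symbol spelling is
  // illustrative, not authoritative): LowerHWASAN_CHECK_MEMACCESS emits a "bl"
  // to an outlined, linker-deduplicated check routine whose name encodes the
  // address register and access info, roughly __hwasan_check_x<reg>_<info>.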
  case AArch64::HWASAN_CHECK_MEMACCESS:
  case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
    LowerHWASAN_CHECK_MEMACCESS(*MI);
    return;

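  // The SEH_* pseudos carry Windows ARM64 unwind information; each case below
  // forwards its immediate operands to the corresponding .seh_* directive via
  // the AArch64 target streamer (TS).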
  case AArch64::SEH_StackAlloc:
    TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
    return;

  case AArch64::SEH_SaveFPLR:
    TS->EmitARM64WinCFISaveFPLR(MI->getOperand(0).getImm());
    return;

  case AArch64::SEH_SaveFPLR_X:
    assert(MI->getOperand(0).getImm() < 0 &&
           "Pre increment SEH opcode must have a negative offset");
    TS->EmitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm());
    return;

  case AArch64::SEH_SaveReg:
    TS->EmitARM64WinCFISaveReg(MI->getOperand(0).getImm(),
                               MI->getOperand(1).getImm());
    return;

  case AArch64::SEH_SaveReg_X:
    assert(MI->getOperand(1).getImm() < 0 &&
           "Pre increment SEH opcode must have a negative offset");
    TS->EmitARM64WinCFISaveRegX(MI->getOperand(0).getImm(),
                                -MI->getOperand(1).getImm());
    return;

  case AArch64::SEH_SaveRegP:
    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
           "Non-consecutive registers not allowed for save_regp");
    TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
                                MI->getOperand(2).getImm());
    return;

  case AArch64::SEH_SaveRegP_X:
    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
           "Non-consecutive registers not allowed for save_regp_x");
    assert(MI->getOperand(2).getImm() < 0 &&
           "Pre increment SEH opcode must have a negative offset");
    TS->EmitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(),
                                 -MI->getOperand(2).getImm());
    return;

  case AArch64::SEH_SaveFReg:
    TS->EmitARM64WinCFISaveFReg(MI->getOperand(0).getImm(),
                                MI->getOperand(1).getImm());
    return;

  case AArch64::SEH_SaveFReg_X:
    assert(MI->getOperand(1).getImm() < 0 &&
           "Pre increment SEH opcode must have a negative offset");
    TS->EmitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(),
                                 -MI->getOperand(1).getImm());
    return;

  case AArch64::SEH_SaveFRegP:
    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
           "Non-consecutive registers not allowed for save_fregp");
    TS->EmitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(),
                                 MI->getOperand(2).getImm());
    return;

  case AArch64::SEH_SaveFRegP_X:
    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
           "Non-consecutive registers not allowed for save_fregp_x");
    assert(MI->getOperand(2).getImm() < 0 &&
           "Pre increment SEH opcode must have a negative offset");
    TS->EmitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(),
                                  -MI->getOperand(2).getImm());
    return;

  case AArch64::SEH_SetFP:
    TS->EmitARM64WinCFISetFP();
    return;

  case AArch64::SEH_AddFP:
    TS->EmitARM64WinCFIAddFP(MI->getOperand(0).getImm());
    return;

  case AArch64::SEH_Nop:
    TS->EmitARM64WinCFINop();
    return;

  case AArch64::SEH_PrologEnd:
    TS->EmitARM64WinCFIPrologEnd();
    return;

  case AArch64::SEH_EpilogStart:
    TS->EmitARM64WinCFIEpilogStart();
    return;

  case AArch64::SEH_EpilogEnd:
    TS->EmitARM64WinCFIEpilogEnd();
    return;
  }

  // Finally, do the automated lowerings for everything else.
  MCInst TmpInst;
  MCInstLowering.Lower(MI, TmpInst);
  EmitToStreamer(*OutStreamer, TmpInst);
}

// Force static initialization.
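// This entry point is what the target registry's InitializeAllAsmPrinters()
// machinery ends up calling, making the printers registered below selectable
// by target triple.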
extern "C" void LLVMInitializeAArch64AsmPrinter() {
|
2016-10-10 07:00:34 +08:00
|
|
|
RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
|
|
|
|
RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
|
|
|
|
RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
|
2019-09-12 18:22:23 +08:00
|
|
|
RegisterAsmPrinter<AArch64AsmPrinter> W(getTheARM64_32Target());
|
|
|
|
RegisterAsmPrinter<AArch64AsmPrinter> V(getTheAArch64_32Target());
|
2014-03-29 18:18:08 +08:00
|
|
|
}