2012-02-19 10:03:36 +08:00
|
|
|
//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
|
2005-07-12 09:41:54 +08:00
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2005-07-12 09:41:54 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2011-07-02 05:01:15 +08:00
|
|
|
// This file declares the X86 specific subclass of TargetSubtargetInfo.
|
2005-07-12 09:41:54 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-08-14 00:26:38 +08:00
|
|
|
#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
|
|
|
|
#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
|
2005-07-12 09:41:54 +08:00
|
|
|
|
2014-06-10 01:08:19 +08:00
|
|
|
#include "X86FrameLowering.h"
|
|
|
|
#include "X86ISelLowering.h"
|
|
|
|
#include "X86InstrInfo.h"
|
|
|
|
#include "X86SelectionDAGInfo.h"
|
2017-02-03 06:55:55 +08:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
2010-07-06 03:26:33 +08:00
|
|
|
#include "llvm/ADT/Triple.h"
|
2017-08-16 06:31:51 +08:00
|
|
|
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
|
|
|
|
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
|
|
|
|
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
|
|
|
|
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
|
2017-11-17 09:07:10 +08:00
|
|
|
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/CallingConv.h"
|
2017-02-03 06:55:55 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
2018-06-09 17:19:40 +08:00
|
|
|
#include <climits>
|
2017-02-03 06:55:55 +08:00
|
|
|
#include <memory>
|
2005-09-02 05:38:21 +08:00
|
|
|
|
2011-07-02 04:45:01 +08:00
|
|
|
#define GET_SUBTARGETINFO_HEADER
|
2011-07-02 06:36:09 +08:00
|
|
|
#include "X86GenSubtargetInfo.inc"
|
2011-07-02 04:45:01 +08:00
|
|
|
|
2005-07-12 09:41:54 +08:00
|
|
|
namespace llvm {
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2006-12-01 06:42:55 +08:00
|
|
|
class GlobalValue;
|
2010-03-01 06:54:30 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// The X86 backend supports a number of different styles of PIC.
|
2010-03-01 06:54:30 +08:00
|
|
|
///
|
2008-11-28 17:29:37 +08:00
|
|
|
namespace PICStyles {
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2007-01-13 03:20:47 +08:00
|
|
|
// The PIC (position-independent code) style in effect for a subtarget;
// X86Subtarget stores the selected value in its PICStyle member.
enum Style {
|
2016-06-21 07:41:56 +08:00
|
|
|
StubPIC, // Used on i386-darwin in pic mode.
|
|
|
|
GOT, // Used on 32-bit ELF when in pic mode.
|
|
|
|
RIPRel, // Used on X86-64 when in pic mode.
|
|
|
|
None // Set when not in pic mode.
|
2007-01-13 03:20:47 +08:00
|
|
|
};
|
2005-07-12 09:41:54 +08:00
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
} // end namespace PICStyles
|
2014-06-10 01:08:19 +08:00
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
class X86Subtarget final : public X86GenSubtargetInfo {
|
2018-07-31 03:41:25 +08:00
|
|
|
public:
|
2018-10-23 07:14:55 +08:00
|
|
|
// NOTE: Do not add anything new to this list. Coarse, CPU name based flags
|
|
|
|
// are not a good idea. We should be migrating away from these.
|
2012-02-02 07:20:51 +08:00
|
|
|
// Coarse processor-family tag; the current value is kept in X86ProcFamily.
enum X86ProcFamilyEnum {
|
2017-11-20 16:18:12 +08:00
|
|
|
Others, // Initial value of X86ProcFamily.
|
2017-09-13 17:00:27 +08:00
|
|
|
IntelAtom,
|
|
|
|
IntelSLM,
|
|
|
|
IntelGLM,
|
2018-04-16 15:47:35 +08:00
|
|
|
IntelGLP,
|
2018-10-26 01:29:00 +08:00
|
|
|
IntelTRM
|
2012-02-02 07:20:51 +08:00
|
|
|
};
|
|
|
|
|
2017-11-20 16:18:12 +08:00
|
|
|
protected:
|
|
|
|
// SSE/AVX feature level recorded in X86SSELevel, which starts out as NoSSE.
enum X86SSEEnum {
|
|
|
|
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
|
|
|
|
};
|
|
|
|
|
|
|
|
// MMX/3DNow feature level recorded in X863DNowLevel, which starts out as
// NoThreeDNow.
enum X863DNowEnum {
|
|
|
|
NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
|
|
|
|
};
|
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// X86 processor family: Intel Atom, and others
|
2018-06-09 17:19:40 +08:00
|
|
|
X86ProcFamilyEnum X86ProcFamily = Others;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Which PIC style to use
|
2008-11-28 17:29:37 +08:00
|
|
|
PICStyles::Style PICStyle;
|
2010-03-01 06:54:30 +08:00
|
|
|
|
2016-05-20 06:07:57 +08:00
|
|
|
const TargetMachine &TM;
|
2016-05-20 02:49:29 +08:00
|
|
|
|
Move the MMX subtarget feature out of the SSE set of features and into
its own variable.
This is needed so that we can explicitly turn off MMX without turning
off SSE and also so that we can diagnose feature set incompatibilities
that involve MMX without SSE.
Rationale:
// sse3
__m128d test_mm_addsub_pd(__m128d A, __m128d B) {
return _mm_addsub_pd(A, B);
}
// mmx
void shift(__m64 a, __m64 b, int c) {
_mm_slli_pi16(a, c);
_mm_slli_pi32(a, c);
_mm_slli_si64(a, c);
_mm_srli_pi16(a, c);
_mm_srli_pi32(a, c);
_mm_srli_si64(a, c);
_mm_srai_pi16(a, c);
_mm_srai_pi32(a, c);
}
clang -msse3 -mno-mmx file.c -c
For this code we should be able to explicitly turn off MMX
without affecting the compilation of the SSE3 function and then
diagnose and error on compiling the MMX function.
This matches the existing gcc behavior and follows the spirit of
the SSE/MMX separation in llvm where we can (and do) turn off
MMX code generation except in the presence of intrinsics.
Updated a couple of tests, but primarily tested with a couple of tests
for turning on only mmx and only sse.
This is paired with a patch to clang to take advantage of this behavior.
llvm-svn: 249731
2015-10-09 04:10:06 +08:00
|
|
|
/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F, or none
/// supported.
|
2018-06-09 17:19:40 +08:00
|
|
|
X86SSEEnum X86SSELevel = NoSSE;
|
2006-01-27 16:10:46 +08:00
|
|
|
|
2015-11-14 11:04:00 +08:00
|
|
|
/// MMX, 3DNow, 3DNow Athlon, or none supported.
|
2018-06-09 17:19:40 +08:00
|
|
|
X863DNowEnum X863DNowLevel = NoThreeDNow;
|
2006-10-06 17:17:41 +08:00
|
|
|
|
2016-03-23 19:13:54 +08:00
|
|
|
/// True if the processor supports X87 instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasX87 = false;
|
2016-03-23 19:13:54 +08:00
|
|
|
|
2019-03-21 07:35:49 +08:00
|
|
|
/// True if the processor supports CMPXCHG8B.
|
|
|
|
bool HasCmpxchg8b = false;
|
|
|
|
|
2018-01-11 06:07:16 +08:00
|
|
|
/// True if this processor has NOPL instruction
|
|
|
|
/// (generally pentium pro+).
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasNOPL = false;
|
2018-01-11 06:07:16 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if this processor has conditional move instructions
|
2009-09-02 13:53:04 +08:00
|
|
|
/// (generally pentium pro+).
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCMov = false;
|
2010-03-01 06:54:30 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the processor supports X86-64 instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasX86_64 = false;
|
2009-01-02 13:35:45 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the processor supports POPCNT.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPOPCNT = false;
|
2010-12-05 04:32:23 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the processor supports SSE4A instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSSE4A = false;
|
2009-05-27 05:04:35 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has AES instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasAES = false;
|
|
|
|
bool HasVAES = false;
|
2010-04-03 05:54:27 +08:00
|
|
|
|
2015-10-16 14:03:09 +08:00
|
|
|
/// Target has FXSAVE/FXRESTOR instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFXSR = false;
|
2015-10-16 14:03:09 +08:00
|
|
|
|
2015-10-12 19:47:46 +08:00
|
|
|
/// Target has XSAVE instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasXSAVE = false;
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2015-10-12 19:47:46 +08:00
|
|
|
/// Target has XSAVEOPT instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasXSAVEOPT = false;
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2015-10-12 19:47:46 +08:00
|
|
|
/// Target has XSAVEC instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasXSAVEC = false;
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2015-10-12 19:47:46 +08:00
|
|
|
/// Target has XSAVES instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasXSAVES = false;
|
2015-10-12 19:47:46 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has carry-less multiplication
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPCLMUL = false;
|
|
|
|
bool HasVPCLMULQDQ = false;
|
2010-07-23 09:17:51 +08:00
|
|
|
|
2017-11-26 17:36:41 +08:00
|
|
|
/// Target has Galois Field Arithmetic instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasGFNI = false;
|
2017-11-26 17:36:41 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has 3-operand fused multiply-add
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFMA = false;
|
2009-06-27 06:46:54 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has 4-operand fused multiply-add
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFMA4 = false;
|
2009-06-27 06:46:54 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has XOP instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasXOP = false;
|
2011-12-02 23:14:37 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Target has TBM instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasTBM = false;
|
2013-09-25 02:21:52 +08:00
|
|
|
|
2017-05-03 23:51:39 +08:00
|
|
|
/// Target has LWP instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasLWP = false;
|
2017-05-03 23:51:39 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the processor has the MOVBE instruction.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMOVBE = false;
|
2011-10-04 01:28:23 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the processor has the RDRAND instruction.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasRDRAND = false;
|
2011-10-04 01:28:23 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has 16-bit floating point conversion instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasF16C = false;
|
2011-10-09 15:31:39 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has FS/GS base instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFSGSBase = false;
|
2011-10-31 03:57:21 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has LZCNT instruction.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasLZCNT = false;
|
2011-10-11 14:44:02 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has BMI1 instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasBMI = false;
|
2011-10-14 11:21:46 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has BMI2 instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasBMI2 = false;
|
2011-10-16 15:55:05 +08:00
|
|
|
|
2016-01-17 21:42:12 +08:00
|
|
|
/// Processor has VBMI instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasVBMI = false;
|
2016-01-17 21:42:12 +08:00
|
|
|
|
2017-11-21 17:48:44 +08:00
|
|
|
/// Processor has VBMI2 instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasVBMI2 = false;
|
2017-11-21 17:48:44 +08:00
|
|
|
|
2016-01-24 18:41:28 +08:00
|
|
|
/// Processor has Integer Fused Multiply Add
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasIFMA = false;
|
2016-01-24 18:41:28 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has RTM instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasRTM = false;
|
2012-11-08 15:28:54 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has ADX instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasADX = false;
|
2013-02-15 03:08:21 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has SHA instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSHA = false;
|
2013-09-12 23:51:31 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has PRFCHW instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPRFCHW = false;
|
2013-03-27 01:47:11 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Processor has RDSEED instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasRDSEED = false;
|
2013-03-29 07:41:26 +08:00
|
|
|
|
2015-12-05 07:00:33 +08:00
|
|
|
/// Processor has LAHF/SAHF instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasLAHFSAHF = false;
|
2015-12-05 07:00:33 +08:00
|
|
|
|
2016-05-18 19:59:12 +08:00
|
|
|
/// Processor has MONITORX/MWAITX instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMWAITX = false;
|
2016-05-18 19:59:12 +08:00
|
|
|
|
2017-02-09 12:27:34 +08:00
|
|
|
/// Processor has Cache Line Zero instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCLZERO = false;
|
2017-02-09 12:27:34 +08:00
|
|
|
|
2018-04-13 15:35:08 +08:00
|
|
|
/// Processor has Cache Line Demote instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCLDEMOTE = false;
|
2018-04-13 15:35:08 +08:00
|
|
|
|
2018-05-01 18:01:16 +08:00
|
|
|
/// Processor has MOVDIRI instruction (direct store integer).
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMOVDIRI = false;
|
2018-05-01 18:01:16 +08:00
|
|
|
|
|
|
|
/// Processor has MOVDIR64B instruction (direct store 64 bytes).
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMOVDIR64B = false;
|
2018-05-01 18:01:16 +08:00
|
|
|
|
2018-05-10 15:26:05 +08:00
|
|
|
/// Processor has ptwrite instruction.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPTWRITE = false;
|
2018-05-10 15:26:05 +08:00
|
|
|
|
2016-01-24 18:41:28 +08:00
|
|
|
/// Processor has Prefetch with intent to Write instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPREFETCHWT1 = false;
|
2016-01-24 18:41:28 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if SHLD instructions are slow.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool IsSHLDSlow = false;
|
SHLD/SHRD are VectorPath (microcode) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should be implemented using alternative sequences of instructions composed of add, adc, shr, shl, or and lea which are directPath instructions. These alternative instructions not only have a lower latency but they also increase the decode bandwidth by allowing simultaneous decoding of a third directPath instruction.
AMD's processor families K7, K8, K10, K12, K15 and K16 are known to have SHLD/SHRD instructions with very poor latency. Optimization guides for these processors recommend using an alternative sequence of instructions. For these AMD processors, I disabled folding (or (x << c) | (y >> (64 - c))) when we are not optimizing for size.
It might be beneficial to disable this folding for some of Intel's processors. However, since I couldn't find specific recommendations regarding using SHLD/SHRD instructions on Intel's processors, I haven't disabled this peephole for Intel.
llvm-svn: 195383
2013-11-22 07:21:26 +08:00
|
|
|
|
2016-12-07 03:35:20 +08:00
|
|
|
/// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
|
|
|
|
/// PMULUDQ.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool IsPMULLDSlow = false;
|
2016-12-07 03:35:20 +08:00
|
|
|
|
2018-10-26 01:29:00 +08:00
|
|
|
/// True if the PMADDWD instruction is slow compared to PMULLD.
|
|
|
|
bool IsPMADDWDSlow = false;
|
|
|
|
|
2015-09-02 04:51:51 +08:00
|
|
|
/// True if unaligned memory accesses of 16-bytes are slow.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool IsUAMem16Slow = false;
|
2010-04-01 13:58:17 +08:00
|
|
|
|
2015-08-22 04:17:26 +08:00
|
|
|
/// True if unaligned memory accesses of 32-bytes are slow.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool IsUAMem32Slow = false;
|
2014-12-04 13:20:33 +08:00
|
|
|
|
2015-02-04 01:13:04 +08:00
|
|
|
/// True if SSE operations can have unaligned memory operands.
|
|
|
|
/// This may require setting a configuration bit in the processor.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSSEUnalignedMem = false;
|
2010-01-12 00:29:42 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if this processor has the CMPXCHG16B instruction;
|
2011-08-27 05:21:21 +08:00
|
|
|
/// this is true for most x86-64 chips, but not the first AMD chips.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCmpxchg16b = false;
|
2011-08-27 05:21:21 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the LEA instruction should be used for adjusting
|
2012-02-08 06:50:41 +08:00
|
|
|
/// the stack pointer. This is an optimization for Intel Atom processors.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool UseLeaForSP = false;
|
2012-02-08 06:50:41 +08:00
|
|
|
|
2018-01-22 18:07:01 +08:00
|
|
|
/// True if POPCNT instruction has a false dependency on the destination register.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPOPCNTFalseDeps = false;
|
2018-01-22 18:07:01 +08:00
|
|
|
|
|
|
|
/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasLZCNTFalseDeps = false;
|
2018-01-22 18:07:01 +08:00
|
|
|
|
2017-12-19 21:16:43 +08:00
|
|
|
/// True if it's preferable to combine to a single shuffle using a variable
|
|
|
|
/// mask over multiple fixed shuffles.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastVariableShuffle = false;
|
2017-12-19 21:16:43 +08:00
|
|
|
|
2016-02-13 07:37:57 +08:00
|
|
|
/// True if there is no performance penalty to writing only the lower parts
|
2017-03-03 17:03:24 +08:00
|
|
|
/// of a YMM or ZMM register without clearing the upper part.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastPartialYMMorZMMWrite = false;
|
2016-02-13 07:37:57 +08:00
|
|
|
|
2018-01-30 05:24:31 +08:00
|
|
|
/// True if there is no performance penalty for writing NOPs with up to
|
|
|
|
/// 11 bytes.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFast11ByteNOP = false;
|
2018-01-30 05:24:31 +08:00
|
|
|
|
|
|
|
/// True if there is no performance penalty for writing NOPs with up to
|
|
|
|
/// 15 bytes.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFast15ByteNOP = false;
|
2018-01-30 05:24:31 +08:00
|
|
|
|
2017-11-26 02:09:37 +08:00
|
|
|
/// True if gather is reasonably fast. This is true for Skylake client and
|
|
|
|
/// all AVX-512 CPUs.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastGather = false;
|
2017-11-26 02:09:37 +08:00
|
|
|
|
2016-08-04 20:47:28 +08:00
|
|
|
/// True if hardware SQRTSS instruction is at least as fast (latency) as
|
|
|
|
/// RSQRTSS followed by a Newton-Raphson iteration.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastScalarFSQRT = false;
|
2016-08-04 20:47:28 +08:00
|
|
|
|
|
|
|
/// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
|
|
|
|
/// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastVectorFSQRT = false;
|
2016-08-04 20:47:28 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if 8-bit divisions are significantly faster than
|
2014-11-21 19:19:34 +08:00
|
|
|
/// 32-bit divisions and should be used when possible.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSlowDivide32 = false;
|
2014-11-21 19:19:34 +08:00
|
|
|
|
2017-01-13 03:34:15 +08:00
|
|
|
/// True if 32-bit divides are significantly faster than
|
2014-11-21 19:19:34 +08:00
|
|
|
/// 64-bit divisions and should be used when possible.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSlowDivide64 = false;
|
2012-09-05 02:22:17 +08:00
|
|
|
|
2016-10-15 00:41:38 +08:00
|
|
|
/// True if LZCNT instruction is fast.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastLZCNT = false;
|
2016-10-15 00:41:38 +08:00
|
|
|
|
2017-02-21 14:39:13 +08:00
|
|
|
/// True if SHLD based rotate is fast.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasFastSHLDRotate = false;
|
2017-02-21 14:39:13 +08:00
|
|
|
|
2017-08-30 12:34:48 +08:00
|
|
|
/// True if the processor supports macrofusion.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMacroFusion = false;
|
2017-08-30 12:34:48 +08:00
|
|
|
|
2019-03-28 22:12:46 +08:00
|
|
|
/// True if the processor supports branch fusion.
|
|
|
|
bool HasBranchFusion = false;
|
|
|
|
|
2017-04-21 17:20:50 +08:00
|
|
|
/// True if the processor has enhanced REP MOVSB/STOSB.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasERMSB = false;
|
2017-04-21 17:20:39 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the short functions should be padded to prevent
|
2013-01-09 02:27:24 +08:00
|
|
|
/// a stall when returning too early.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool PadShortFunctions = false;
|
2013-01-09 02:27:24 +08:00
|
|
|
|
2017-08-29 13:14:27 +08:00
|
|
|
/// True if two memory operand instructions should use a temporary register
|
|
|
|
/// instead.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool SlowTwoMemOps = false;
|
2015-02-04 02:47:32 +08:00
|
|
|
|
|
|
|
/// True if the LEA instruction inputs have to be ready at address generation
|
|
|
|
/// (AG) time.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool LEAUsesAG = false;
|
2013-03-28 03:14:02 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if the LEA instruction with certain arguments is slow
|
2018-06-09 17:19:40 +08:00
|
|
|
bool SlowLEA = false;
|
2014-05-20 16:55:50 +08:00
|
|
|
|
2017-05-18 16:11:50 +08:00
|
|
|
/// True if the LEA instruction has all three source operands: base, index,
|
|
|
|
/// and offset or if the LEA instruction uses base and index registers where
|
|
|
|
/// the base is EBP, RBP, or R13.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool Slow3OpsLEA = false;
|
2017-05-18 16:11:50 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if INC and DEC instructions are slow when writing to flags
|
2018-06-09 17:19:40 +08:00
|
|
|
bool SlowIncDec = false;
|
2014-06-09 19:40:41 +08:00
|
|
|
|
2013-07-24 19:02:47 +08:00
|
|
|
/// Processor has AVX-512 PreFetch Instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPFI = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
2013-07-24 19:02:47 +08:00
|
|
|
/// Processor has AVX-512 Exponential and Reciprocal Instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasERI = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
2013-07-24 19:02:47 +08:00
|
|
|
/// Processor has AVX-512 Conflict Detection Instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCDI = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
2017-05-25 21:45:23 +08:00
|
|
|
/// Processor has AVX-512 population count Instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasVPOPCNTDQ = false;
|
2017-05-25 21:45:23 +08:00
|
|
|
|
2014-07-21 22:54:21 +08:00
|
|
|
/// Processor has AVX-512 Doubleword and Quadword instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasDQI = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
|
|
|
/// Processor has AVX-512 Byte and Word instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasBWI = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
|
|
|
/// Processor has AVX-512 Vector Length eXtensions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasVLX = false;
|
2014-07-21 22:54:21 +08:00
|
|
|
|
2015-12-15 21:35:29 +08:00
|
|
|
/// Processor has PKU extensions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPKU = false;
|
2015-12-15 21:35:29 +08:00
|
|
|
|
2017-11-21 18:04:28 +08:00
|
|
|
/// Processor has AVX-512 Vector Neural Network Instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasVNNI = false;
|
2017-11-21 18:04:28 +08:00
|
|
|
|
Enable AVX512_BF16 instructions, which are supported for BFLOAT16 in Cooper Lake
Summary:
1. Enable infrastructure of AVX512_BF16, which is supported for BFLOAT16 in Cooper Lake;
2. Enable VCVTNE2PS2BF16, VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector Neural Network Instructions supporting BFLOAT16 inputs and conversion instructions from IEEE single precision.
VCVTNE2PS2BF16: Convert Two Packed Single Data to One Packed BF16 Data.
VCVTNEPS2BF16: Convert Packed Single Data to Packed BF16 Data.
VDPBF16PS: Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
For more details about BF16 isa, please refer to the latest ISE document: https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference
Author: LiuTianle
Reviewers: craig.topper, smaslov, LuoYuanke, wxiao3, annita.zhang, RKSimon, spatel
Reviewed By: craig.topper
Subscribers: kristina, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60550
llvm-svn: 360017
2019-05-06 16:22:37 +08:00
|
|
|
/// Processor has AVX-512 bfloat16 floating-point extensions
|
|
|
|
bool HasBF16 = false;
|
|
|
|
|
2017-11-21 18:32:42 +08:00
|
|
|
/// Processor has AVX-512 Bit Algorithms instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasBITALG = false;
|
2017-11-21 18:32:42 +08:00
|
|
|
|
2016-01-24 18:41:28 +08:00
|
|
|
/// Processor supports MPX - Memory Protection Extensions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasMPX = false;
|
2015-06-03 18:30:57 +08:00
|
|
|
|
2017-11-26 21:02:45 +08:00
|
|
|
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
|
|
|
|
/// using Shadow Stack
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSHSTK = false;
|
2017-11-26 21:02:45 +08:00
|
|
|
|
2018-05-25 14:32:05 +08:00
|
|
|
/// Processor supports Invalidate Process-Context Identifier
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasINVPCID = false;
|
2018-05-25 14:32:05 +08:00
|
|
|
|
2016-01-24 18:41:28 +08:00
|
|
|
/// Processor has Software Guard Extensions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasSGX = false;
|
2016-01-24 18:41:28 +08:00
|
|
|
|
|
|
|
/// Processor supports Flush Cache Line instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCLFLUSHOPT = false;
|
2016-01-24 18:41:28 +08:00
|
|
|
|
|
|
|
/// Processor supports Cache Line Write Back instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasCLWB = false;
|
2016-01-24 18:41:28 +08:00
|
|
|
|
2018-04-12 04:01:57 +08:00
|
|
|
/// Processor supports Write Back No Invalidate instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasWBNOINVD = false;
|
2018-04-12 04:01:57 +08:00
|
|
|
|
2018-01-19 07:52:31 +08:00
|
|
|
/// Processor supports RDPID instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasRDPID = false;
|
2018-01-19 07:52:31 +08:00
|
|
|
|
2018-04-21 02:42:47 +08:00
|
|
|
/// Processor supports WaitPKG instructions
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasWAITPKG = false;
|
2018-04-21 02:42:47 +08:00
|
|
|
|
2018-05-08 14:47:36 +08:00
|
|
|
/// Processor supports PCONFIG instruction
|
2018-06-09 17:19:40 +08:00
|
|
|
bool HasPCONFIG = false;
|
2018-05-08 14:47:36 +08:00
|
|
|
|
2018-09-30 11:01:46 +08:00
|
|
|
/// Processor has a single uop BEXTR implementation.
|
|
|
|
bool HasFastBEXTR = false;
|
|
|
|
|
[x86] add and use fast horizontal vector math subtarget feature
This is the planned follow-up to D52997. Here we are reducing horizontal vector math codegen
by default. AMD Jaguar (btver2) should have no difference with this patch because it has
fast-hops. (If we want to set that bit for other CPUs, let me know.)
The code changes are small, but there are many test diffs. For files that are specifically
testing for hops, I added RUNs to distinguish fast/slow, so we can see the consequences
side-by-side. For files that are primarily concerned with codegen other than hops, I just
updated the CHECK lines to reflect the new default codegen.
To recap the recent horizontal op story:
1. Before rL343727, we were producing hops for all subtargets for a variety of patterns.
Hops were likely not optimal for all targets though.
2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so
we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195).
3. We restored the hop codegen for all targets with rL344141. Good for Jaguar, but
probably bad for other CPUs.
4. This patch allows us to distinguish when we want to produce hops, so everyone can be
happy. I'm not sure if we have the best predicate here, but the intent is to undo the
extra hop-iness that was enabled by r344141.
Differential Revision: https://reviews.llvm.org/D53095
llvm-svn: 344361
2018-10-13 00:41:02 +08:00
|
|
|
/// Try harder to combine to horizontal vector ops if they are fast.
|
|
|
|
bool HasFastHorizontalOps = false;
|
|
|
|
|
2019-05-14 23:21:28 +08:00
|
|
|
/// Prefer a left/right scalar logical shifts pair over a shift+and pair.
|
|
|
|
bool HasFastScalarShiftMasks = false;
|
|
|
|
|
2019-04-26 18:49:13 +08:00
|
|
|
/// Prefer a left/right vector logical shifts pair over a shift+and pair.
|
|
|
|
bool HasFastVectorShiftMasks = false;
|
|
|
|
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", and is one of the two halves to Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk name must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying transformations similar to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
/// Use a retpoline thunk rather than indirect calls to block speculative
|
|
|
|
/// execution.
|
2018-08-23 14:06:38 +08:00
|
|
|
bool UseRetpolineIndirectCalls = false;
|
|
|
|
|
|
|
|
/// Use a retpoline thunk or remove any indirect branch to block speculative
|
|
|
|
/// execution.
|
|
|
|
bool UseRetpolineIndirectBranches = false;
|
|
|
|
|
|
|
|
/// Deprecated flag, query `UseRetpolineIndirectCalls` and
|
|
|
|
/// `UseRetpolineIndirectBranches` instead.
|
|
|
|
bool DeprecatedUseRetpoline = false;
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", which is one of the two halves of Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk name must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying transformations similar to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
|
|
|
|
/// When using a retpoline thunk, call an externally provided thunk rather
|
|
|
|
/// than emitting one inside the compiler.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool UseRetpolineExternalThunk = false;
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", which is one of the two halves of Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk name must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying transformations similar to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
|
2015-05-12 09:26:05 +08:00
|
|
|
/// Use software floating point for code generation.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool UseSoftFloat = false;
|
2015-05-12 09:26:05 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// The minimum alignment known to hold of the stack frame on
|
2005-07-12 10:36:10 +08:00
|
|
|
/// entry to the function and which must be maintained by every function.
|
2018-06-09 17:19:40 +08:00
|
|
|
unsigned stackAlignment = 4;
|
2005-07-27 13:53:44 +08:00
|
|
|
|
2007-10-31 19:52:06 +08:00
|
|
|
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
|
2007-08-02 07:45:51 +08:00
|
|
|
///
|
2018-06-09 17:19:40 +08:00
|
|
|
// FIXME: this is a known good value for Yonah. How about others?
|
|
|
|
unsigned MaxInlineSizeThreshold = 128;
|
2011-02-17 20:23:50 +08:00
|
|
|
|
2018-01-20 08:26:08 +08:00
|
|
|
/// Indicates target prefers 256 bit instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
bool Prefer256Bit = false;
|
2018-01-20 08:26:08 +08:00
|
|
|
|
2018-10-10 06:03:40 +08:00
|
|
|
/// Threeway branch is profitable in this subtarget.
|
|
|
|
bool ThreewayBranchProfitable = false;
|
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// What processor and OS we're targeting.
|
2010-07-06 03:26:33 +08:00
|
|
|
Triple TargetTriple;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2017-08-16 06:31:51 +08:00
|
|
|
/// GlobalISel related APIs.
|
|
|
|
std::unique_ptr<CallLowering> CallLoweringInfo;
|
|
|
|
std::unique_ptr<LegalizerInfo> Legalizer;
|
|
|
|
std::unique_ptr<RegisterBankInfo> RegBankInfo;
|
|
|
|
std::unique_ptr<InstructionSelector> InstSelector;
|
2014-08-09 12:38:53 +08:00
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
private:
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Override the stack alignment.
|
2013-02-16 06:31:27 +08:00
|
|
|
unsigned StackAlignOverride;
|
|
|
|
|
2018-01-20 08:26:08 +08:00
|
|
|
/// Preferred vector width from function attribute.
|
|
|
|
unsigned PreferVectorWidthOverride;
|
|
|
|
|
|
|
|
/// Resolved preferred vector width from function attribute and subtarget
|
|
|
|
/// features.
|
2018-06-09 17:19:40 +08:00
|
|
|
unsigned PreferVectorWidth = UINT32_MAX;
|
2018-01-20 08:26:08 +08:00
|
|
|
|
2018-02-11 16:06:27 +08:00
|
|
|
/// Required vector width from function attribute.
|
|
|
|
unsigned RequiredVectorWidth;
|
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
|
2011-07-08 05:06:52 +08:00
|
|
|
bool In64BitMode;
|
2006-09-08 14:48:29 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if compiling for 32-bit, false for 16-bit or 64-bit.
|
2014-01-06 12:55:54 +08:00
|
|
|
bool In32BitMode;
|
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// True if compiling for 16-bit, false for 32-bit or 64-bit.
|
2014-01-06 12:55:54 +08:00
|
|
|
bool In16BitMode;
|
|
|
|
|
2017-09-13 17:00:27 +08:00
|
|
|
/// Contains the overhead of gather/scatter instructions.
|
2018-06-09 17:19:40 +08:00
|
|
|
int GatherOverhead = 1024;
|
|
|
|
int ScatterOverhead = 1024;
|
2017-09-13 17:00:27 +08:00
|
|
|
|
2014-06-10 01:08:19 +08:00
|
|
|
X86SelectionDAGInfo TSInfo;
|
2014-06-11 08:25:19 +08:00
|
|
|
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
|
|
|
|
// X86TargetLowering needs.
|
|
|
|
X86InstrInfo InstrInfo;
|
|
|
|
X86TargetLowering TLInfo;
|
|
|
|
X86FrameLowering FrameLowering;
|
2014-06-10 01:08:19 +08:00
|
|
|
|
2005-07-12 09:41:54 +08:00
|
|
|
public:
|
2005-07-27 13:53:44 +08:00
|
|
|
/// This constructor initializes the data members to match that
|
2009-08-03 06:11:08 +08:00
|
|
|
/// of the specified triple.
|
2005-07-12 09:41:54 +08:00
|
|
|
///
|
2016-05-21 02:16:06 +08:00
|
|
|
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
|
2018-01-20 08:26:08 +08:00
|
|
|
const X86TargetMachine &TM, unsigned StackAlignOverride,
|
2018-02-11 16:06:27 +08:00
|
|
|
unsigned PreferVectorWidthOverride,
|
|
|
|
unsigned RequiredVectorWidth);
|
2014-06-10 01:08:19 +08:00
|
|
|
|
2014-08-05 05:25:23 +08:00
|
|
|
/// Returns a pointer to the TLInfo member of this subtarget.
const X86TargetLowering *getTargetLowering() const override {
  const X86TargetLowering *Lowering = &TLInfo;
  return Lowering;
}
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2014-08-05 05:25:23 +08:00
|
|
|
/// Returns a pointer to the InstrInfo member of this subtarget.
const X86InstrInfo *getInstrInfo() const override {
  return &InstrInfo;
}
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2014-08-05 05:25:23 +08:00
|
|
|
/// Returns a pointer to the FrameLowering member of this subtarget.
const X86FrameLowering *getFrameLowering() const override {
  const X86FrameLowering *Lowering = &FrameLowering;
  return Lowering;
}
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2014-08-05 05:25:23 +08:00
|
|
|
/// Returns a pointer to the TSInfo member of this subtarget.
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
  const X86SelectionDAGInfo *Info = &TSInfo;
  return Info;
}
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2014-08-05 05:25:23 +08:00
|
|
|
/// Returns the register info object, obtained from the instruction info
/// returned by getInstrInfo().
const X86RegisterInfo *getRegisterInfo() const override {
  const X86InstrInfo *II = getInstrInfo();
  return &II->getRegisterInfo();
}
|
2005-07-12 10:36:10 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Returns the minimum alignment known to hold of the
|
2005-07-12 10:36:10 +08:00
|
|
|
/// stack frame on entry to the function and which must be maintained by every
|
|
|
|
/// function for this subtarget.
|
2005-07-12 09:41:54 +08:00
|
|
|
unsigned getStackAlignment() const { return stackAlignment; }
|
2005-07-27 13:53:44 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Returns the maximum memset / memcpy size
|
2007-10-31 19:52:06 +08:00
|
|
|
/// that still makes it profitable to inline the call.
|
|
|
|
unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
|
2006-11-21 08:01:06 +08:00
|
|
|
|
|
|
|
/// ParseSubtargetFeatures - Parses features string setting specified
|
2006-10-06 17:17:41 +08:00
|
|
|
/// subtarget options. Definition of function is auto generated by tblgen.
|
2011-07-07 15:07:08 +08:00
|
|
|
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
|
2006-10-06 17:17:41 +08:00
|
|
|
|
2016-11-15 14:34:33 +08:00
|
|
|
/// Methods used by Global ISel
|
|
|
|
const CallLowering *getCallLowering() const override;
|
|
|
|
const InstructionSelector *getInstructionSelector() const override;
|
|
|
|
const LegalizerInfo *getLegalizerInfo() const override;
|
|
|
|
const RegisterBankInfo *getRegBankInfo() const override;
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2013-02-16 09:36:26 +08:00
|
|
|
private:
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Initialize the full set of dependencies so we can use an initializer
|
2014-06-11 08:25:19 +08:00
|
|
|
/// list for X86Subtarget.
|
|
|
|
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
|
2014-09-04 04:36:31 +08:00
|
|
|
void initSubtargetFeatures(StringRef CPU, StringRef FS);
|
2017-02-03 06:55:55 +08:00
|
|
|
|
2013-02-16 09:36:26 +08:00
|
|
|
public:
|
2013-01-26 06:07:43 +08:00
|
|
|
/// Is this x86_64? (disregarding specific ABI / programming model)
|
|
|
|
bool is64Bit() const {
|
|
|
|
return In64BitMode;
|
|
|
|
}
|
|
|
|
|
2014-01-06 12:55:54 +08:00
|
|
|
bool is32Bit() const {
|
|
|
|
return In32BitMode;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool is16Bit() const {
|
|
|
|
return In16BitMode;
|
|
|
|
}
|
|
|
|
|
2013-01-26 06:07:43 +08:00
|
|
|
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
|
|
|
|
bool isTarget64BitILP32() const {
|
2013-12-19 08:44:37 +08:00
|
|
|
return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
|
2014-11-23 03:12:10 +08:00
|
|
|
TargetTriple.isOSNaCl());
|
2013-01-26 06:07:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
|
|
|
|
bool isTarget64BitLP64() const {
|
2014-08-07 17:41:19 +08:00
|
|
|
return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
|
2014-11-23 03:12:10 +08:00
|
|
|
!TargetTriple.isOSNaCl());
|
2013-01-26 06:07:43 +08:00
|
|
|
}
|
2006-01-26 17:53:06 +08:00
|
|
|
|
2008-11-28 17:29:37 +08:00
|
|
|
/// Returns the current value of the PICStyle member.
PICStyles::Style getPICStyle() const {
  return PICStyle;
}
|
|
|
|
/// Overwrites the PICStyle member with \p Style.
void setPICStyle(PICStyles::Style Style) {
  PICStyle = Style;
}
|
2007-01-13 03:20:47 +08:00
|
|
|
|
2016-03-23 19:13:54 +08:00
|
|
|
/// Accessor for the HasX87 subtarget flag.
bool hasX87() const {
  return HasX87;
}
|
2019-03-21 07:35:49 +08:00
|
|
|
/// Accessor for the HasCmpxchg8b subtarget flag.
bool hasCmpxchg8b() const {
  return HasCmpxchg8b;
}
|
2018-01-11 06:07:16 +08:00
|
|
|
/// Accessor for the HasNOPL subtarget flag.
bool hasNOPL() const {
  return HasNOPL;
}
|
2018-08-27 02:29:33 +08:00
|
|
|
// SSE codegen depends on cmovs, and all SSE1+ processors support them.
|
|
|
|
// All 64-bit processors support cmov.
|
|
|
|
bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
|
2012-01-10 14:30:56 +08:00
|
|
|
/// True when X86SSELevel is SSE1 or higher.
bool hasSSE1() const {
  return X86SSELevel >= SSE1;
}
|
|
|
|
/// True when X86SSELevel is SSE2 or higher.
bool hasSSE2() const {
  return X86SSELevel >= SSE2;
}
|
|
|
|
/// True when X86SSELevel is SSE3 or higher.
bool hasSSE3() const {
  return X86SSELevel >= SSE3;
}
|
|
|
|
/// True when X86SSELevel is SSSE3 or higher.
bool hasSSSE3() const {
  return X86SSELevel >= SSSE3;
}
|
|
|
|
/// True when X86SSELevel is SSE41 or higher.
bool hasSSE41() const {
  return X86SSELevel >= SSE41;
}
|
|
|
|
/// True when X86SSELevel is SSE42 or higher.
bool hasSSE42() const {
  return X86SSELevel >= SSE42;
}
|
2012-01-10 14:54:16 +08:00
|
|
|
/// True when X86SSELevel is AVX or higher.
bool hasAVX() const {
  return X86SSELevel >= AVX;
}
|
|
|
|
/// True when X86SSELevel is AVX2 or higher.
bool hasAVX2() const {
  return X86SSELevel >= AVX2;
}
|
2013-08-21 11:57:57 +08:00
|
|
|
/// True when X86SSELevel is AVX512F or higher.
bool hasAVX512() const {
  return X86SSELevel >= AVX512F;
}
|
2012-11-29 20:44:59 +08:00
|
|
|
/// Forwards to hasAVX2().
bool hasInt256() const {
  return hasAVX2();
}
|
2009-05-27 05:04:35 +08:00
|
|
|
/// Accessor for the HasSSE4A subtarget flag.
bool hasSSE4A() const {
  return HasSSE4A;
}
|
2015-11-14 11:04:00 +08:00
|
|
|
/// True when X863DNowLevel is MMX or higher.
bool hasMMX() const {
  return X863DNowLevel >= MMX;
}
|
2006-10-06 17:17:41 +08:00
|
|
|
/// True when X863DNowLevel is ThreeDNow or higher.
bool has3DNow() const {
  return X863DNowLevel >= ThreeDNow;
}
|
|
|
|
/// True when X863DNowLevel is ThreeDNowA or higher.
bool has3DNowA() const {
  return X863DNowLevel >= ThreeDNowA;
}
|
2010-12-05 04:32:23 +08:00
|
|
|
/// Accessor for the HasPOPCNT subtarget flag.
bool hasPOPCNT() const {
  return HasPOPCNT;
}
|
2010-04-03 05:54:27 +08:00
|
|
|
/// Accessor for the HasAES subtarget flag.
bool hasAES() const {
  return HasAES;
}
|
2017-11-21 17:11:41 +08:00
|
|
|
/// Accessor for the HasVAES subtarget flag.
bool hasVAES() const {
  return HasVAES;
}
|
2015-10-16 14:03:09 +08:00
|
|
|
/// Accessor for the HasFXSR subtarget flag.
bool hasFXSR() const {
  return HasFXSR;
}
|
2015-10-12 19:47:46 +08:00
|
|
|
/// Accessor for the HasXSAVE subtarget flag.
bool hasXSAVE() const {
  return HasXSAVE;
}
|
|
|
|
/// Accessor for the HasXSAVEOPT subtarget flag.
bool hasXSAVEOPT() const {
  return HasXSAVEOPT;
}
|
|
|
|
/// Accessor for the HasXSAVEC subtarget flag.
bool hasXSAVEC() const {
  return HasXSAVEC;
}
|
|
|
|
/// Accessor for the HasXSAVES subtarget flag.
bool hasXSAVES() const {
  return HasXSAVES;
}
|
2012-05-31 22:34:17 +08:00
|
|
|
/// Accessor for the HasPCLMUL subtarget flag.
bool hasPCLMUL() const {
  return HasPCLMUL;
}
|
2017-11-21 17:30:33 +08:00
|
|
|
/// Accessor for the HasVPCLMULQDQ subtarget flag.
bool hasVPCLMULQDQ() const {
  return HasVPCLMULQDQ;
}
|
2017-11-26 17:36:41 +08:00
|
|
|
/// Accessor for the HasGFNI subtarget flag.
bool hasGFNI() const {
  return HasGFNI;
}
|
2015-12-01 06:22:06 +08:00
|
|
|
// Prefer FMA4 to FMA - its better for commutation/memory folding and
|
|
|
|
// has equal or better performance on all supported targets.
|
2017-11-26 02:32:43 +08:00
|
|
|
bool hasFMA() const { return HasFMA; }
|
2015-12-01 06:22:06 +08:00
|
|
|
/// Accessor for the HasFMA4 subtarget flag.
bool hasFMA4() const {
  return HasFMA4;
}
|
2017-03-17 15:37:31 +08:00
|
|
|
/// True when either hasFMA() or hasFMA4() holds.
bool hasAnyFMA() const {
  if (hasFMA())
    return true;
  return hasFMA4();
}
|
2011-12-02 23:14:37 +08:00
|
|
|
/// Accessor for the HasXOP subtarget flag.
bool hasXOP() const {
  return HasXOP;
}
|
2013-09-25 02:21:52 +08:00
|
|
|
/// Accessor for the HasTBM subtarget flag.
bool hasTBM() const {
  return HasTBM;
}
|
2017-05-03 23:51:39 +08:00
|
|
|
/// Accessor for the HasLWP subtarget flag.
bool hasLWP() const {
  return HasLWP;
}
|
2011-10-04 01:28:23 +08:00
|
|
|
/// Accessor for the HasMOVBE subtarget flag.
bool hasMOVBE() const {
  return HasMOVBE;
}
|
|
|
|
/// Accessor for the HasRDRAND subtarget flag.
bool hasRDRAND() const {
  return HasRDRAND;
}
|
2011-10-09 15:31:39 +08:00
|
|
|
/// Accessor for the HasF16C subtarget flag.
bool hasF16C() const {
  return HasF16C;
}
|
2011-10-31 03:57:21 +08:00
|
|
|
/// Accessor for the HasFSGSBase subtarget flag.
bool hasFSGSBase() const {
  return HasFSGSBase;
}
|
2011-10-11 14:44:02 +08:00
|
|
|
/// Accessor for the HasLZCNT subtarget flag.
bool hasLZCNT() const {
  return HasLZCNT;
}
|
2011-10-14 11:21:46 +08:00
|
|
|
/// Accessor for the HasBMI subtarget flag.
bool hasBMI() const {
  return HasBMI;
}
|
2011-10-16 15:55:05 +08:00
|
|
|
/// Accessor for the HasBMI2 subtarget flag.
bool hasBMI2() const {
  return HasBMI2;
}
|
2016-01-17 21:42:12 +08:00
|
|
|
/// Accessor for the HasVBMI subtarget flag.
bool hasVBMI() const {
  return HasVBMI;
}
|
2017-11-21 17:48:44 +08:00
|
|
|
/// Accessor for the HasVBMI2 subtarget flag.
bool hasVBMI2() const {
  return HasVBMI2;
}
|
2016-01-24 18:41:28 +08:00
|
|
|
/// Accessor for the HasIFMA subtarget flag.
bool hasIFMA() const {
  return HasIFMA;
}
|
2012-11-08 15:28:54 +08:00
|
|
|
/// Accessor for the HasRTM subtarget flag.
bool hasRTM() const {
  return HasRTM;
}
|
2013-02-15 03:08:21 +08:00
|
|
|
/// Accessor for the HasADX subtarget flag.
bool hasADX() const {
  return HasADX;
}
|
2013-09-12 23:51:31 +08:00
|
|
|
/// Accessor for the HasSHA subtarget flag.
bool hasSHA() const {
  return HasSHA;
}
|
2017-12-22 10:30:30 +08:00
|
|
|
/// True when either HasPRFCHW or HasPREFETCHWT1 is set.
bool hasPRFCHW() const {
  if (HasPRFCHW)
    return true;
  return HasPREFETCHWT1;
}
|
|
|
|
/// Accessor for the HasPREFETCHWT1 subtarget flag.
bool hasPREFETCHWT1() const {
  return HasPREFETCHWT1;
}
|
|
|
|
bool hasSSEPrefetch() const {
|
|
|
|
// We implicitly enable these when we have a write prefix supporting cache
|
|
|
|
// level OR if we have prfchw, but don't already have a read prefetch from
|
|
|
|
// 3dnow.
|
|
|
|
return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
|
|
|
|
}
|
2013-03-29 07:41:26 +08:00
|
|
|
/// Accessor for the HasRDSEED subtarget flag.
bool hasRDSEED() const {
  return HasRDSEED;
}
|
2015-12-05 07:00:33 +08:00
|
|
|
/// Accessor for the HasLAHFSAHF subtarget flag.
bool hasLAHFSAHF() const {
  return HasLAHFSAHF;
}
|
2016-05-18 19:59:12 +08:00
|
|
|
/// Accessor for the HasMWAITX subtarget flag.
bool hasMWAITX() const {
  return HasMWAITX;
}
|
2017-02-09 12:27:34 +08:00
|
|
|
/// Accessor for the HasCLZERO subtarget flag.
bool hasCLZERO() const {
  return HasCLZERO;
}
|
2018-04-13 15:35:08 +08:00
|
|
|
/// Accessor for the HasCLDEMOTE subtarget flag.
bool hasCLDEMOTE() const {
  return HasCLDEMOTE;
}
|
2018-05-01 18:01:16 +08:00
|
|
|
/// Accessor for the HasMOVDIRI subtarget flag.
bool hasMOVDIRI() const {
  return HasMOVDIRI;
}
|
|
|
|
/// Accessor for the HasMOVDIR64B subtarget flag.
bool hasMOVDIR64B() const {
  return HasMOVDIR64B;
}
|
2018-05-10 15:26:05 +08:00
|
|
|
/// Accessor for the HasPTWRITE subtarget flag.
bool hasPTWRITE() const {
  return HasPTWRITE;
}
|
SHLD/SHRD are VectorPath (microcode) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should be implemented using alternative sequences of instructions composed of add, adc, shr, shl, or and lea which are directPath instructions. These alternative instructions not only have a lower latency but they also increase the decode bandwidth by allowing simultaneous decoding of a third directPath instruction.
AMD processor families K7, K8, K10, K12, K15 and K16 are known to have SHLD/SHRD instructions with very poor latency. Optimization guides for these processors recommend using an alternative sequence of instructions. For these AMD processors, I disabled folding (or (x << c) | (y >> (64 - c))) when we are not optimizing for size.
It might be beneficial to disable this folding for some of Intel's processors. However, since I couldn't find specific recommendations regarding using SHLD/SHRD instructions on Intel's processors, I haven't disabled this peephole for Intel.
llvm-svn: 195383
2013-11-22 07:21:26 +08:00
|
|
|
/// Accessor for the IsSHLDSlow subtarget flag.
bool isSHLDSlow() const {
  return IsSHLDSlow;
}
|
2016-12-07 03:35:20 +08:00
|
|
|
/// Accessor for the IsPMULLDSlow subtarget flag.
bool isPMULLDSlow() const {
  return IsPMULLDSlow;
}
|
2018-10-26 01:29:00 +08:00
|
|
|
/// Accessor for the IsPMADDWDSlow subtarget flag.
bool isPMADDWDSlow() const {
  return IsPMADDWDSlow;
}
|
2015-09-02 04:51:51 +08:00
|
|
|
/// Accessor for the IsUAMem16Slow subtarget flag.
bool isUnalignedMem16Slow() const {
  return IsUAMem16Slow;
}
|
2014-11-22 01:40:04 +08:00
|
|
|
/// Accessor for the IsUAMem32Slow subtarget flag.
bool isUnalignedMem32Slow() const {
  return IsUAMem32Slow;
}
|
2017-09-13 17:00:27 +08:00
|
|
|
/// Accessor for the stored GatherOverhead value.
int getGatherOverhead() const {
  return GatherOverhead;
}
|
|
|
|
/// Accessor for the stored ScatterOverhead value.
int getScatterOverhead() const {
  return ScatterOverhead;
}
|
2015-02-04 01:13:04 +08:00
|
|
|
/// Accessor for the HasSSEUnalignedMem subtarget flag.
bool hasSSEUnalignedMem() const {
  return HasSSEUnalignedMem;
}
|
2019-03-14 02:48:50 +08:00
|
|
|
/// True only when the HasCmpxchg16b flag is set and is64Bit() holds.
bool hasCmpxchg16b() const {
  if (!HasCmpxchg16b)
    return false;
  return is64Bit();
}
|
2012-02-08 06:50:41 +08:00
|
|
|
/// Accessor for the UseLeaForSP subtarget flag.
bool useLeaForSP() const {
  return UseLeaForSP;
}
|
2018-01-22 18:07:01 +08:00
|
|
|
/// Accessor for the HasPOPCNTFalseDeps subtarget flag.
bool hasPOPCNTFalseDeps() const {
  return HasPOPCNTFalseDeps;
}
|
|
|
|
/// Accessor for the HasLZCNTFalseDeps subtarget flag.
bool hasLZCNTFalseDeps() const {
  return HasLZCNTFalseDeps;
}
|
2017-12-19 21:16:43 +08:00
|
|
|
bool hasFastVariableShuffle() const {
|
|
|
|
return HasFastVariableShuffle;
|
|
|
|
}
|
2017-03-03 17:03:24 +08:00
|
|
|
bool hasFastPartialYMMorZMMWrite() const {
|
|
|
|
return HasFastPartialYMMorZMMWrite;
|
|
|
|
}
|
2017-11-26 02:09:37 +08:00
|
|
|
/// Accessor for the HasFastGather subtarget flag.
bool hasFastGather() const {
  return HasFastGather;
}
|
2016-08-04 20:47:28 +08:00
|
|
|
/// Accessor for the HasFastScalarFSQRT subtarget flag.
bool hasFastScalarFSQRT() const {
  return HasFastScalarFSQRT;
}
|
|
|
|
/// Accessor for the HasFastVectorFSQRT subtarget flag.
bool hasFastVectorFSQRT() const {
  return HasFastVectorFSQRT;
}
|
2016-10-15 00:41:38 +08:00
|
|
|
/// Accessor for the HasFastLZCNT subtarget flag.
bool hasFastLZCNT() const {
  return HasFastLZCNT;
}
|
2017-02-21 14:39:13 +08:00
|
|
|
/// Accessor for the HasFastSHLDRotate subtarget flag.
bool hasFastSHLDRotate() const {
  return HasFastSHLDRotate;
}
|
2018-09-30 11:01:46 +08:00
|
|
|
/// Accessor for the HasFastBEXTR subtarget flag.
bool hasFastBEXTR() const {
  return HasFastBEXTR;
}
|
[x86] add and use fast horizontal vector math subtarget feature
This is the planned follow-up to D52997. Here we are reducing horizontal vector math codegen
by default. AMD Jaguar (btver2) should have no difference with this patch because it has
fast-hops. (If we want to set that bit for other CPUs, let me know.)
The code changes are small, but there are many test diffs. For files that are specifically
testing for hops, I added RUNs to distinguish fast/slow, so we can see the consequences
side-by-side. For files that are primarily concerned with codegen other than hops, I just
updated the CHECK lines to reflect the new default codegen.
To recap the recent horizontal op story:
1. Before rL343727, we were producing hops for all subtargets for a variety of patterns.
Hops were likely not optimal for all targets though.
2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so
we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195).
3. We restored the hop codegen for all targets with rL344141. Good for Jaguar, but
probably bad for other CPUs.
4. This patch allows us to distinguish when we want to produce hops, so everyone can be
happy. I'm not sure if we have the best predicate here, but the intent is to undo the
extra hop-iness that was enabled by r344141.
Differential Revision: https://reviews.llvm.org/D53095
llvm-svn: 344361
2018-10-13 00:41:02 +08:00
|
|
|
/// Accessor for the HasFastHorizontalOps subtarget flag.
bool hasFastHorizontalOps() const {
  return HasFastHorizontalOps;
}
|
2019-05-14 23:21:28 +08:00
|
|
|
/// Accessor for the HasFastScalarShiftMasks subtarget flag.
bool hasFastScalarShiftMasks() const {
  return HasFastScalarShiftMasks;
}
|
2019-04-26 18:49:13 +08:00
|
|
|
/// Accessor for the HasFastVectorShiftMasks subtarget flag.
bool hasFastVectorShiftMasks() const {
  return HasFastVectorShiftMasks;
}
|
2017-08-30 12:34:48 +08:00
|
|
|
/// Accessor for the HasMacroFusion subtarget flag.
bool hasMacroFusion() const {
  return HasMacroFusion;
}
|
2019-03-28 22:12:46 +08:00
|
|
|
/// Accessor for the HasBranchFusion subtarget flag.
bool hasBranchFusion() const {
  return HasBranchFusion;
}
|
2017-04-21 17:20:50 +08:00
|
|
|
/// Accessor for the HasERMSB subtarget flag.
bool hasERMSB() const {
  return HasERMSB;
}
|
2014-11-21 19:19:34 +08:00
|
|
|
/// Accessor for the HasSlowDivide32 subtarget flag.
bool hasSlowDivide32() const {
  return HasSlowDivide32;
}
|
|
|
|
/// Accessor for the HasSlowDivide64 subtarget flag.
bool hasSlowDivide64() const {
  return HasSlowDivide64;
}
|
2013-01-09 02:27:24 +08:00
|
|
|
/// Accessor for the PadShortFunctions subtarget flag.
bool padShortFunctions() const {
  return PadShortFunctions;
}
|
2017-08-29 13:14:27 +08:00
|
|
|
/// Accessor for the SlowTwoMemOps subtarget flag.
bool slowTwoMemOps() const {
  return SlowTwoMemOps;
}
|
2013-04-26 04:29:37 +08:00
|
|
|
/// Accessor for the LEAUsesAG subtarget flag.
bool LEAusesAG() const {
  return LEAUsesAG;
}
|
2014-05-20 16:55:50 +08:00
|
|
|
/// Accessor for the SlowLEA subtarget flag.
bool slowLEA() const {
  return SlowLEA;
}
|
2017-05-18 16:11:50 +08:00
|
|
|
/// Accessor for the Slow3OpsLEA subtarget flag.
bool slow3OpsLEA() const {
  return Slow3OpsLEA;
}
|
2014-06-09 19:40:41 +08:00
|
|
|
/// Accessor for the SlowIncDec subtarget flag.
bool slowIncDec() const {
  return SlowIncDec;
}
|
2013-07-24 19:02:47 +08:00
|
|
|
/// Accessor for the HasCDI subtarget flag.
bool hasCDI() const {
  return HasCDI;
}
|
2017-05-25 21:45:23 +08:00
|
|
|
/// Accessor for the HasVPOPCNTDQ subtarget flag.
bool hasVPOPCNTDQ() const {
  return HasVPOPCNTDQ;
}
|
2013-07-24 19:02:47 +08:00
|
|
|
/// Accessor for the HasPFI subtarget flag.
bool hasPFI() const {
  return HasPFI;
}
|
|
|
|
/// Accessor for the HasERI subtarget flag.
bool hasERI() const {
  return HasERI;
}
|
2014-07-21 22:54:21 +08:00
|
|
|
/// Accessor for the HasDQI subtarget flag.
bool hasDQI() const {
  return HasDQI;
}
|
|
|
|
/// Accessor for the HasBWI subtarget flag.
bool hasBWI() const {
  return HasBWI;
}
|
|
|
|
/// Accessor for the HasVLX subtarget flag.
bool hasVLX() const {
  return HasVLX;
}
|
2015-12-15 21:35:29 +08:00
|
|
|
/// Accessor for the HasPKU subtarget flag.
bool hasPKU() const {
  return HasPKU;
}
|
2017-11-21 18:04:28 +08:00
|
|
|
/// Accessor for the HasVNNI subtarget flag.
bool hasVNNI() const {
  return HasVNNI;
}
|
Enable AVX512_BF16 instructions, which are supported for BFLOAT16 in Cooper Lake
Summary:
1. Enable infrastructure of AVX512_BF16, which is supported for BFLOAT16 in Cooper Lake;
2. Enable VCVTNE2PS2BF16, VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector Neural Network Instructions supporting BFLOAT16 inputs and conversion instructions from IEEE single precision.
VCVTNE2PS2BF16: Convert Two Packed Single Data to One Packed BF16 Data.
VCVTNEPS2BF16: Convert Packed Single Data to Packed BF16 Data.
VDPBF16PS: Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
For more details about BF16 isa, please refer to the latest ISE document: https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference
Author: LiuTianle
Reviewers: craig.topper, smaslov, LuoYuanke, wxiao3, annita.zhang, RKSimon, spatel
Reviewed By: craig.topper
Subscribers: kristina, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60550
llvm-svn: 360017
2019-05-06 16:22:37 +08:00
|
|
|
/// Accessor for the HasBF16 subtarget flag.
bool hasBF16() const {
  return HasBF16;
}
|
2017-11-21 18:32:42 +08:00
|
|
|
/// Accessor for the HasBITALG subtarget flag.
bool hasBITALG() const {
  return HasBITALG;
}
|
2015-06-03 18:30:57 +08:00
|
|
|
/// Accessor for the HasMPX subtarget flag.
bool hasMPX() const {
  return HasMPX;
}
|
2017-11-26 21:02:45 +08:00
|
|
|
/// Accessor for the HasSHSTK subtarget flag.
bool hasSHSTK() const {
  return HasSHSTK;
}
|
2017-02-08 13:45:46 +08:00
|
|
|
/// Accessor for the HasCLFLUSHOPT subtarget flag.
bool hasCLFLUSHOPT() const {
  return HasCLFLUSHOPT;
}
|
2017-08-30 07:13:36 +08:00
|
|
|
/// Accessor for the HasCLWB subtarget flag.
bool hasCLWB() const {
  return HasCLWB;
}
|
2018-04-12 04:01:57 +08:00
|
|
|
/// Accessor for the HasWBNOINVD subtarget flag.
bool hasWBNOINVD() const {
  return HasWBNOINVD;
}
|
2018-01-19 07:52:31 +08:00
|
|
|
/// Accessor for the HasRDPID subtarget flag.
bool hasRDPID() const {
  return HasRDPID;
}
|
2018-04-21 02:42:47 +08:00
|
|
|
/// Accessor for the HasWAITPKG subtarget flag.
bool hasWAITPKG() const {
  return HasWAITPKG;
}
|
2018-05-08 14:47:36 +08:00
|
|
|
/// Accessor for the HasPCONFIG subtarget flag.
bool hasPCONFIG() const {
  return HasPCONFIG;
}
|
2018-05-08 15:11:05 +08:00
|
|
|
/// Accessor for the HasSGX subtarget flag.
bool hasSGX() const {
  return HasSGX;
}
|
2018-10-10 06:03:40 +08:00
|
|
|
/// Accessor for the ThreewayBranchProfitable subtarget flag.
bool threewayBranchProfitable() const {
  return ThreewayBranchProfitable;
}
|
2018-05-25 14:32:05 +08:00
|
|
|
/// Accessor for the HasINVPCID subtarget flag.
bool hasINVPCID() const {
  return HasINVPCID;
}
|
2018-08-23 14:06:38 +08:00
|
|
|
/// Accessor for the UseRetpolineIndirectCalls subtarget flag.
bool useRetpolineIndirectCalls() const {
  return UseRetpolineIndirectCalls;
}
|
|
|
|
bool useRetpolineIndirectBranches() const {
|
|
|
|
return UseRetpolineIndirectBranches;
|
|
|
|
}
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", and is one of the two halves to Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk names must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying transformations similar to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
/// Accessor for the UseRetpolineExternalThunk subtarget flag.
bool useRetpolineExternalThunk() const {
  return UseRetpolineExternalThunk;
}
|
2009-01-02 13:35:45 +08:00
|
|
|
|
2018-01-20 08:26:08 +08:00
|
|
|
/// Accessor for the stored PreferVectorWidth value.
unsigned getPreferVectorWidth() const {
  return PreferVectorWidth;
}
|
2018-02-11 16:06:27 +08:00
|
|
|
/// Accessor for the stored RequiredVectorWidth value.
unsigned getRequiredVectorWidth() const {
  return RequiredVectorWidth;
}
|
2018-01-20 08:26:08 +08:00
|
|
|
|
2018-01-20 08:26:12 +08:00
|
|
|
// Helper functions to determine when we should allow widening to 512-bit
|
|
|
|
// during codegen.
|
|
|
|
// TODO: Currently we're always allowing widening on CPUs without VLX,
|
|
|
|
// because for many cases we don't have a better option.
|
|
|
|
bool canExtendTo512DQ() const {
|
|
|
|
return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
|
|
|
|
}
|
|
|
|
bool canExtendTo512BW() const {
|
|
|
|
return hasBWI() && canExtendTo512DQ();
|
|
|
|
}
|
|
|
|
|
2018-02-11 16:06:27 +08:00
|
|
|
// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
|
|
|
|
// disable them in the legalizer.
|
|
|
|
bool useAVX512Regs() const {
|
|
|
|
return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool useBWIRegs() const {
|
|
|
|
return hasBWI() && useAVX512Regs();
|
|
|
|
}
|
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
/// Override that reports XRay support exactly when is64Bit() holds.
bool isXRaySupported() const override {
  return is64Bit();
}
|
2016-09-19 08:54:35 +08:00
|
|
|
|
2017-09-13 17:00:27 +08:00
|
|
|
/// Accessor for the stored X86ProcFamily value.
X86ProcFamilyEnum getProcFamily() const {
  return X86ProcFamily;
}
|
|
|
|
|
|
|
|
/// TODO: to be removed later and replaced with suitable properties
|
2012-02-02 07:20:51 +08:00
|
|
|
/// True when the processor family is IntelAtom.
bool isAtom() const {
  return X86ProcFamily == IntelAtom;
}
|
2014-05-20 16:55:50 +08:00
|
|
|
/// True when the processor family is IntelSLM.
bool isSLM() const {
  return X86ProcFamily == IntelSLM;
}
|
2018-04-16 15:47:35 +08:00
|
|
|
bool isGLM() const {
|
|
|
|
return X86ProcFamily == IntelGLM ||
|
|
|
|
X86ProcFamily == IntelGLP ||
|
|
|
|
X86ProcFamily == IntelTRM;
|
|
|
|
}
|
2015-05-12 09:26:05 +08:00
|
|
|
/// Accessor for the UseSoftFloat subtarget flag.
bool useSoftFloat() const {
  return UseSoftFloat;
}
|
2012-02-02 07:20:51 +08:00
|
|
|
|
2016-02-14 01:26:29 +08:00
|
|
|
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
|
|
|
|
/// no-sse2). There isn't any reason to disable it if the target processor
|
|
|
|
/// supports it.
|
|
|
|
bool hasMFence() const { return hasSSE2() || is64Bit(); }
|
|
|
|
|
2011-04-20 05:01:47 +08:00
|
|
|
/// Gives read-only access to the stored TargetTriple.
const Triple &getTargetTriple() const {
  return TargetTriple;
}
|
|
|
|
|
2011-04-20 05:14:45 +08:00
|
|
|
/// Forwards to TargetTriple.isOSDarwin().
bool isTargetDarwin() const {
  return TargetTriple.isOSDarwin();
}
|
2014-11-23 03:12:10 +08:00
|
|
|
/// Forwards to TargetTriple.isOSFreeBSD().
bool isTargetFreeBSD() const {
  return TargetTriple.isOSFreeBSD();
}
|
2014-12-29 23:47:28 +08:00
|
|
|
/// Forwards to TargetTriple.isOSDragonFly().
bool isTargetDragonFly() const {
  return TargetTriple.isOSDragonFly();
}
|
2014-11-23 03:12:10 +08:00
|
|
|
/// Forwards to TargetTriple.isOSSolaris().
bool isTargetSolaris() const {
  return TargetTriple.isOSSolaris();
}
|
2016-12-01 07:14:27 +08:00
|
|
|
/// Forwards to TargetTriple.isPS4CPU().
bool isTargetPS4() const {
  return TargetTriple.isPS4CPU();
}
|
2013-12-11 00:57:43 +08:00
|
|
|
|
|
|
|
/// Forwards to TargetTriple.isOSBinFormatELF().
bool isTargetELF() const {
  return TargetTriple.isOSBinFormatELF();
}
|
|
|
|
/// Forwards to TargetTriple.isOSBinFormatCOFF().
bool isTargetCOFF() const {
  return TargetTriple.isOSBinFormatCOFF();
}
|
2014-12-05 08:22:38 +08:00
|
|
|
/// Forwards to TargetTriple.isOSBinFormatMachO().
bool isTargetMachO() const {
  return TargetTriple.isOSBinFormatMachO();
}
|
2013-12-11 00:57:43 +08:00
|
|
|
|
2013-08-30 04:23:14 +08:00
|
|
|
/// Forwards to TargetTriple.isOSLinux().
bool isTargetLinux() const {
  return TargetTriple.isOSLinux();
}
|
2016-05-05 19:35:51 +08:00
|
|
|
/// Forwards to TargetTriple.isOSKFreeBSD().
bool isTargetKFreeBSD() const {
  return TargetTriple.isOSKFreeBSD();
}
|
|
|
|
/// Forwards to TargetTriple.isOSGlibc().
bool isTargetGlibc() const {
  return TargetTriple.isOSGlibc();
}
|
2015-10-09 05:21:24 +08:00
|
|
|
/// Forwards to TargetTriple.isAndroid().
bool isTargetAndroid() const {
  return TargetTriple.isAndroid();
}
|
2013-08-30 04:23:14 +08:00
|
|
|
/// Forwards to TargetTriple.isOSNaCl().
bool isTargetNaCl() const {
  return TargetTriple.isOSNaCl();
}
|
2011-09-06 05:51:43 +08:00
|
|
|
/// True when isTargetNaCl() holds and is64Bit() does not.
bool isTargetNaCl32() const {
  if (!isTargetNaCl())
    return false;
  return !is64Bit();
}
|
|
|
|
/// True when both isTargetNaCl() and is64Bit() hold.
bool isTargetNaCl64() const {
  if (!isTargetNaCl())
    return false;
  return is64Bit();
}
|
2015-10-27 15:23:59 +08:00
|
|
|
/// Forwards to TargetTriple.isOSIAMCU().
bool isTargetMCU() const {
  return TargetTriple.isOSIAMCU();
}
|
2017-02-24 11:10:10 +08:00
|
|
|
/// Forwards to TargetTriple.isOSFuchsia().
bool isTargetFuchsia() const {
  return TargetTriple.isOSFuchsia();
}
|
2014-04-02 12:27:51 +08:00
|
|
|
|
|
|
|
bool isTargetWindowsMSVC() const {
|
|
|
|
return TargetTriple.isWindowsMSVCEnvironment();
|
|
|
|
}
|
|
|
|
|
2014-04-02 02:15:34 +08:00
|
|
|
bool isTargetKnownWindowsMSVC() const {
|
2014-03-30 12:35:00 +08:00
|
|
|
return TargetTriple.isKnownWindowsMSVCEnvironment();
|
2014-03-28 06:50:05 +08:00
|
|
|
}
|
2014-04-02 12:27:51 +08:00
|
|
|
|
2015-08-15 06:41:43 +08:00
|
|
|
bool isTargetWindowsCoreCLR() const {
|
|
|
|
return TargetTriple.isWindowsCoreCLREnvironment();
|
|
|
|
}
|
|
|
|
|
2014-04-02 12:27:51 +08:00
|
|
|
bool isTargetWindowsCygwin() const {
|
2014-03-28 06:50:05 +08:00
|
|
|
return TargetTriple.isWindowsCygwinEnvironment();
|
|
|
|
}
|
2014-04-02 12:27:51 +08:00
|
|
|
|
|
|
|
bool isTargetWindowsGNU() const {
|
|
|
|
return TargetTriple.isWindowsGNUEnvironment();
|
|
|
|
}
|
|
|
|
|
2014-11-21 02:01:26 +08:00
|
|
|
bool isTargetWindowsItanium() const {
|
|
|
|
return TargetTriple.isWindowsItaniumEnvironment();
|
|
|
|
}
|
|
|
|
|
2012-02-05 16:26:40 +08:00
|
|
|
/// Forwards to TargetTriple.isOSCygMing().
bool isTargetCygMing() const {
  return TargetTriple.isOSCygMing();
}
|
2010-03-01 06:54:30 +08:00
|
|
|
|
2013-10-24 07:37:01 +08:00
|
|
|
/// Forwards to TargetTriple.isOSWindows().
bool isOSWindows() const {
  return TargetTriple.isOSWindows();
}
|
|
|
|
|
[codeview] Implement FPO data assembler directives
Summary:
This adds a set of new directives that describe 32-bit x86 prologues.
The directives are limited and do not expose the full complexity of
codeview FPO data. They are merely a convenience for the compiler to
generate more readable assembly so we don't need to generate tons of
labels in CodeGen. If our prologue emission changes in the future, we
can change the set of available directives to suit our needs. These are
modelled after the .seh_ directives, which use a different format that
interacts with exception handling.
The directives are:
.cv_fpo_proc _foo
.cv_fpo_pushreg ebp/ebx/etc
.cv_fpo_setframe ebp/esi/etc
.cv_fpo_stackalloc 200
.cv_fpo_endprologue
.cv_fpo_endproc
.cv_fpo_data _foo
I tried to follow the implementation of ARM EHABI CFI directives by
sinking most directives out of MCStreamer and into X86TargetStreamer.
This helps avoid polluting non-X86 code with WinCOFF specific logic.
I used cdb to confirm that this can show locals in parent CSRs in a few
cases, most importantly the one where we use ESI as a frame pointer,
i.e. the one in http://crbug.com/756153#c28
Once we have cdb integration in debuginfo-tests, we can add integration
tests there.
Reviewers: majnemer, hans
Subscribers: aemerson, mgorny, kristof.beyls, llvm-commits, hiraditya
Differential Revision: https://reviews.llvm.org/D38776
llvm-svn: 315513
2017-10-12 05:24:33 +08:00
|
|
|
/// True when In64BitMode is set and isOSWindows() holds.
bool isTargetWin64() const {
  if (!In64BitMode)
    return false;
  return isOSWindows();
}
|
2011-02-01 09:14:13 +08:00
|
|
|
|
[codeview] Implement FPO data assembler directives
Summary:
This adds a set of new directives that describe 32-bit x86 prologues.
The directives are limited and do not expose the full complexity of
codeview FPO data. They are merely a convenience for the compiler to
generate more readable assembly so we don't need to generate tons of
labels in CodeGen. If our prologue emission changes in the future, we
can change the set of available directives to suit our needs. These are
modelled after the .seh_ directives, which use a different format that
interacts with exception handling.
The directives are:
.cv_fpo_proc _foo
.cv_fpo_pushreg ebp/ebx/etc
.cv_fpo_setframe ebp/esi/etc
.cv_fpo_stackalloc 200
.cv_fpo_endprologue
.cv_fpo_endproc
.cv_fpo_data _foo
I tried to follow the implementation of ARM EHABI CFI directives by
sinking most directives out of MCStreamer and into X86TargetStreamer.
This helps avoid polluting non-X86 code with WinCOFF specific logic.
I used cdb to confirm that this can show locals in parent CSRs in a few
cases, most importantly the one where we use ESI as a frame pointer,
i.e. the one in http://crbug.com/756153#c28
Once we have cdb integration in debuginfo-tests, we can add integration
tests there.
Reviewers: majnemer, hans
Subscribers: aemerson, mgorny, kristof.beyls, llvm-commits, hiraditya
Differential Revision: https://reviews.llvm.org/D38776
llvm-svn: 315513
2017-10-12 05:24:33 +08:00
|
|
|
/// True when In64BitMode is clear and isOSWindows() holds.
bool isTargetWin32() const {
  if (In64BitMode)
    return false;
  return isOSWindows();
}
|
2010-09-03 07:03:46 +08:00
|
|
|
|
2008-11-28 17:29:37 +08:00
|
|
|
/// True when the stored PICStyle equals PICStyles::GOT.
bool isPICStyleGOT() const {
  return PICStyle == PICStyles::GOT;
}
|
|
|
|
/// True when the stored PICStyle equals PICStyles::RIPRel.
bool isPICStyleRIPRel() const {
  return PICStyle == PICStyles::RIPRel;
}
|
2009-07-11 04:47:30 +08:00
|
|
|
|
2009-07-11 05:00:45 +08:00
|
|
|
bool isPICStyleStubPIC() const {
|
2009-07-11 04:58:47 +08:00
|
|
|
return PICStyle == PICStyles::StubPIC;
|
|
|
|
}
|
|
|
|
|
2016-06-28 05:33:08 +08:00
|
|
|
/// Forwards to TM.isPositionIndependent().
bool isPositionIndependent() const {
  return TM.isPositionIndependent();
}
|
2016-06-18 08:03:20 +08:00
|
|
|
|
2013-07-12 14:02:35 +08:00
|
|
|
/// Tells whether calling convention \p CC is handled as the Win64 convention.
bool isCallingConvWin64(CallingConv::ID CC) const {
  // CallingConv::Win64 allows using the Win64 convention on other targets,
  // so it is accepted unconditionally.
  if (CC == CallingConv::Win64)
    return true;

  switch (CC) {
  // On Win64 all of these just use the default convention, so the answer is
  // whether we are targeting Win64.
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Swift:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::Intel_OCL_BI:
    return isTargetWin64();
  // CallingConv::X86_64_SysV allows using the SysV convention on Windows
  // targets, and any other convention is unknown; neither counts as Win64.
  default:
    return false;
  }
}
|
2010-03-01 06:54:30 +08:00
|
|
|
|
2016-05-20 02:34:20 +08:00
|
|
|
/// Classify a global variable reference for the current subtarget according
|
|
|
|
/// to how we should reference it in a non-pcrel context.
|
2016-05-20 20:20:10 +08:00
|
|
|
/// \param GV the global value to classify; the result is an unsigned char code.
unsigned char classifyLocalReference(const GlobalValue *GV) const;
|
|
|
|
|
|
|
|
unsigned char classifyGlobalReference(const GlobalValue *GV,
|
|
|
|
const Module &M) const;
|
2016-05-20 06:07:57 +08:00
|
|
|
/// Overload of classifyGlobalReference that takes only the global value \p GV.
unsigned char classifyGlobalReference(const GlobalValue *GV) const;
|
2006-12-20 09:03:20 +08:00
|
|
|
|
2016-05-20 02:34:20 +08:00
|
|
|
/// Classify a global function reference for the current subtarget.
|
2016-05-20 20:20:10 +08:00
|
|
|
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
|
|
|
|
const Module &M) const;
|
2016-05-20 02:49:29 +08:00
|
|
|
/// Overload of classifyGlobalFunctionReference that takes only \p GV.
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
|
2016-04-20 16:32:57 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Classify a blockaddress reference for the current subtarget according to
|
|
|
|
/// how we should reference it in a non-pcrel context.
|
2016-05-20 02:34:20 +08:00
|
|
|
/// Takes no arguments; the classification is returned as an unsigned char code.
unsigned char classifyBlockAddressReference() const;
|
2009-11-21 07:18:13 +08:00
|
|
|
|
2015-02-04 02:47:32 +08:00
|
|
|
/// Return true if the subtarget allows calls to immediate address.
|
2016-05-20 02:49:29 +08:00
|
|
|
/// Declaration only; the definition is not part of this header section.
bool isLegalToCallImmediateAddr() const;
|
2009-05-20 12:53:57 +08:00
|
|
|
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", and is one of the two halves to Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk names must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying similar transformations to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
/// If we are using retpolines, we need to expand indirectbr to avoid it
|
|
|
|
/// lowering to an actual indirect jump.
|
2018-08-23 14:06:38 +08:00
|
|
|
bool enableIndirectBrExpand() const override {
|
|
|
|
// Expansion is requested exactly when retpolines are used for indirect
// branches; the result is forwarded unchanged from that query.
return useRetpolineIndirectBranches();
|
|
|
|
}
|
Introduce the "retpoline" x86 mitigation technique for variant #2 of the speculative execution vulnerabilities disclosed today, specifically identified by CVE-2017-5715, "Branch Target Injection", which is one of the two halves of Spectre.
Summary:
First, we need to explain the core of the vulnerability. Note that this
is a very incomplete description, please see the Project Zero blog post
for details:
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
The basis for branch target injection is to direct speculative execution
of the processor to some "gadget" of executable code by poisoning the
prediction of indirect branches with the address of that gadget. The
gadget in turn contains an operation that provides a side channel for
reading data. Most commonly, this will look like a load of secret data
followed by a branch on the loaded value and then a load of some
predictable cache line. The attacker then uses timing of the processor's
cache to determine which direction the branch took *in the speculative
execution*, and in turn what one bit of the loaded value was. Due to the
nature of these timing side channels and the branch predictor on Intel
processors, this allows an attacker to leak data only accessible to
a privileged domain (like the kernel) back into an unprivileged domain.
The goal is simple: avoid generating code which contains an indirect
branch that could have its prediction poisoned by an attacker. In many
cases, the compiler can simply use directed conditional branches and
a small search tree. LLVM already has support for lowering switches in
this way and the first step of this patch is to disable jump-table
lowering of switches and introduce a pass to rewrite explicit indirectbr
sequences into a switch over integers.
However, there is no fully general alternative to indirect calls. We
introduce a new construct we call a "retpoline" to implement indirect
calls in a non-speculatable way. It can be thought of loosely as
a trampoline for indirect calls which uses the RET instruction on x86.
Further, we arrange for a specific call->ret sequence which ensures the
processor predicts the return to go to a controlled, known location. The
retpoline then "smashes" the return address pushed onto the stack by the
call with the desired target of the original indirect call. The result
is a predicted return to the next instruction after a call (which can be
used to trap speculative execution within an infinite loop) and an
actual indirect branch to an arbitrary address.
On 64-bit x86 ABIs, this is especially easily done in the compiler by
using a guaranteed scratch register to pass the target into this device.
For 32-bit ABIs there isn't a guaranteed scratch register and so several
different retpoline variants are introduced to use a scratch register if
one is available in the calling convention and to otherwise use direct
stack push/pop sequences to pass the target address.
This "retpoline" mitigation is fully described in the following blog
post: https://support.google.com/faqs/answer/7625886
We also support a target feature that disables emission of the retpoline
thunk by the compiler to allow for custom thunks if users want them.
These are particularly useful in environments like kernels that
routinely do hot-patching on boot and want to hot-patch their thunk to
different code sequences. They can write this custom thunk and use
`-mretpoline-external-thunk` *in addition* to `-mretpoline`. In this
case, on x86-64 the thunk names must be:
```
__llvm_external_retpoline_r11
```
or on 32-bit:
```
__llvm_external_retpoline_eax
__llvm_external_retpoline_ecx
__llvm_external_retpoline_edx
__llvm_external_retpoline_push
```
And the target of the retpoline is passed in the named register, or in
the case of the `push` suffix on the top of the stack via a `pushl`
instruction.
There is one other important source of indirect branches in x86 ELF
binaries: the PLT. These patches also include support for LLD to
generate PLT entries that perform a retpoline-style indirection.
The only other indirect branches remaining that we are aware of are from
precompiled runtimes (such as crt0.o and similar). The ones we have
found are not really attackable, and so we have not focused on them
here, but eventually these runtimes should also be replicated for
retpoline-ed configurations for completeness.
For kernels or other freestanding or fully static executables, the
compiler switch `-mretpoline` is sufficient to fully mitigate this
particular attack. For dynamic executables, you must compile *all*
libraries with `-mretpoline` and additionally link the dynamic
executable and all shared libraries with LLD and pass `-z retpolineplt`
(or use similar functionality from some other linker). We strongly
recommend also using `-z now` as non-lazy binding allows the
retpoline-mitigated PLT to be substantially smaller.
When manually applying similar transformations to `-mretpoline` to the
Linux kernel we observed very small performance hits to applications
running typical workloads, and relatively minor hits (approximately 2%)
even for extremely syscall-heavy applications. This is largely due to
the small number of indirect branches that occur in performance
sensitive paths of the kernel.
When using these patches on statically linked applications, especially
C++ applications, you should expect to see a much more dramatic
performance hit. For microbenchmarks that are switch, indirect-, or
virtual-call heavy we have seen overheads ranging from 10% to 50%.
However, real-world workloads exhibit substantially lower performance
impact. Notably, techniques such as PGO and ThinLTO dramatically reduce
the impact of hot indirect calls (by speculatively promoting them to
direct calls) and allow optimized search trees to be used to lower
switches. If you need to deploy these techniques in C++ applications, we
*strongly* recommend that you ensure all hot call targets are statically
linked (avoiding PLT indirection) and use both PGO and ThinLTO. Well
tuned servers using all of these techniques saw 5% - 10% overhead from
the use of retpoline.
We will add detailed documentation covering these components in
subsequent patches, but wanted to make the core functionality available
as soon as possible. Happy for more code review, but we'd really like to
get these patches landed and backported ASAP for obvious reasons. We're
planning to backport this to both 6.0 and 5.0 release streams and get
a 5.0 release with just this cherry picked ASAP for distros and vendors.
This patch is the work of a number of people over the past month: Eric, Reid,
Rui, and myself. I'm mailing it out as a single commit due to the time
sensitive nature of landing this and the need to backport it. Huge thanks to
everyone who helped out here, and everyone at Intel who helped out in
discussions about how to craft this. Also, credit goes to Paul Turner (at
Google, but not an LLVM contributor) for much of the underlying retpoline
design.
Reviewers: echristo, rnk, ruiu, craig.topper, DavidKreitzer
Subscribers: sanjoy, emaste, mcrosier, mgorny, mehdi_amini, hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D41723
llvm-svn: 323155
2018-01-23 06:05:25 +08:00
|
|
|
|
2013-10-16 07:33:07 +08:00
|
|
|
/// Enable the MachineScheduler pass for all X86 subtargets; this override
/// unconditionally returns true.
|
2014-03-02 17:09:27 +08:00
|
|
|
bool enableMachineScheduler() const override { return true; }
|
2013-10-16 07:33:07 +08:00
|
|
|
|
2014-05-22 07:40:26 +08:00
|
|
|
/// Declared here and defined out of line. Presumably reports whether the
/// early if-conversion pass should run for this subtarget.
/// NOTE(review): the deciding criteria are not visible in this header --
/// confirm against the definition.
bool enableEarlyIfConversion() const override;
|
|
|
|
|
2019-04-01 22:08:26 +08:00
|
|
|
/// Declared here and defined out of line. Receives the list of
/// ScheduleDAGMutation objects by non-const reference.
/// NOTE(review): presumably appends the X86 post-RA scheduling mutations to
/// \p Mutations -- confirm against the definition.
void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
|
|
|
|
&Mutations) const override;
|
2019-04-01 21:48:50 +08:00
|
|
|
|
2014-07-16 06:39:58 +08:00
|
|
|
/// Report the anti-dependency breaking mode for X86; this override always
/// returns TargetSubtargetInfo::ANTIDEP_CRITICAL.
/// NOTE(review): presumably this restricts anti-dependency breaking to the
/// critical path -- confirm against TargetSubtargetInfo.
AntiDepBreakMode getAntiDepBreakMode() const override {
|
|
|
|
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
|
|
|
|
}
|
Add logic to greedy reg alloc to avoid bad eviction chains
This fixes bugzilla 26810
https://bugs.llvm.org/show_bug.cgi?id=26810
This is intended to prevent sequences like:
movl %ebp, 8(%esp) # 4-byte Spill
movl %ecx, %ebp
movl %ebx, %ecx
movl %edi, %ebx
movl %edx, %edi
cltd
idivl %esi
movl %edi, %edx
movl %ebx, %edi
movl %ecx, %ebx
movl %ebp, %ecx
movl 16(%esp), %ebp # 4-byte Reload
Such sequences are created in 2 scenarios:
Scenario #1:
vreg0 is evicted from physreg0 by vreg1
Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from)
Region splitting creates a local interval because of interference with the evictor vreg1 (normally region splitting creates 2 intervals, the "by reg" and "by stack" intervals. A local interval is created when interference occurs.)
one of the split intervals ends up evicting vreg2 from physreg1
Evictee vreg2 is intended for region splitting with split candidate physreg1
one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills
Scenario #2:
vreg0 is evicted from physreg0 by vreg1
vreg2 is evicted from physreg2 by vreg3 etc
Evictee vreg0 is intended for region splitting with split candidate physreg1
Region splitting creates a local interval because of interference with the evictor vreg1
one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from)
Another evictee vreg2 is intended for region splitting with split candidate physreg1
one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills
As compile time was a concern, I've added a flag to control whether we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest).
Differential Revision: https://reviews.llvm.org/D35816
Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39
llvm-svn: 316295
2017-10-23 01:59:38 +08:00
|
|
|
|
2017-10-23 03:16:31 +08:00
|
|
|
/// Always returns true. Per the change that introduced this hook, it opts
/// X86 into the greedy register allocator's cost calculation for local
/// intervals expected to be created by region splitting, which is intended
/// to avoid bad eviction chains.
bool enableAdvancedRASplitCost() const override { return true; }
|
2009-09-03 12:37:05 +08:00
|
|
|
};
|
2006-10-17 05:00:37 +08:00
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
} // end namespace llvm
|
2005-07-12 09:41:54 +08:00
|
|
|
|
2017-02-03 06:55:55 +08:00
|
|
|
#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
|