[X86] Rename Subtarget Tuning Feature Flag Prefix. NFC.

As suggested on D107370, this patch renames the tuning feature flags to start with 'Tuning' instead of 'Feature'.

Differential Revision: https://reviews.llvm.org/D107459
This commit is contained in:
Simon Pilgrim 2021-08-05 12:05:02 +01:00
parent cc947e29ea
commit e78bf49a58
4 changed files with 261 additions and 259 deletions

View File

@ -1086,11 +1086,11 @@ unsigned X86AsmBackend::getMaximumNopSize() const {
return 4;
if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
return 1;
if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
if (STI.getFeatureBits()[X86::TuningFast7ByteNOP])
return 7;
if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
if (STI.getFeatureBits()[X86::TuningFast15ByteNOP])
return 15;
if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
if (STI.getFeatureBits()[X86::TuningFast11ByteNOP])
return 11;
// FIXME: handle 32-bit mode
// 15-bytes is the longest single NOP instruction, but 10-bytes is

View File

@ -367,120 +367,120 @@ def FeatureLVILoadHardening
// X86 Subtarget Tuning features
//===----------------------------------------------------------------------===//
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
"PMULLD instruction is slow">;
def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
"true",
"PMADDWD is slower than PMULLD">;
// FIXME: This should not apply to CPUs that do not have SSE.
def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
def TuningSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
"IsUAMem16Slow", "true",
"Slow unaligned 16-byte memory access">;
def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
def TuningSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
"IsUAMem32Slow", "true",
"Slow unaligned 32-byte memory access">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
def TuningLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
def TuningSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
def TuningSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
"Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
def TuningPadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
def TuningSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
"SlowTwoMemOps", "true",
"Two memory operand instructions are slow">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
def TuningSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
def TuningSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def TuningSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
def TuningPOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def FeatureFastVariableCrossLaneShuffle
def TuningFastVariableCrossLaneShuffle
: SubtargetFeature<"fast-variable-crosslane-shuffle",
"HasFastVariableCrossLaneShuffle",
"true", "Cross-lane shuffles with variable masks are fast">;
def FeatureFastVariablePerLaneShuffle
def TuningFastVariablePerLaneShuffle
: SubtargetFeature<"fast-variable-perlane-shuffle",
"HasFastVariablePerLaneShuffle",
"true", "Per-lane shuffles with variable masks are fast">;
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def FeatureInsertVZEROUPPER
def TuningInsertVZEROUPPER
: SubtargetFeature<"vzeroupper",
"InsertVZEROUPPER",
"true", "Should insert vzeroupper instructions">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// TuningFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. TuningFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
// The idea is that throughput bound code is likely to be vectorized, so for
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
def FeatureFastScalarFSQRT
def TuningFastScalarFSQRT
: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
def FeatureFastVectorFSQRT
def TuningFastVectorFSQRT
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
// be used to replace test/set sequences.
def FeatureFastLZCNT
def TuningFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
// If the target can efficiently decode NOPs up to 7 bytes in length.
def FeatureFast7ByteNOP
def TuningFast7ByteNOP
: SubtargetFeature<
"fast-7bytenop", "HasFast7ByteNOP", "true",
"Target can quickly decode up to 7 byte NOPs">;
// If the target can efficiently decode NOPs up to 11 bytes in length.
def FeatureFast11ByteNOP
def TuningFast11ByteNOP
: SubtargetFeature<
"fast-11bytenop", "HasFast11ByteNOP", "true",
"Target can quickly decode up to 11 byte NOPs">;
// If the target can efficiently decode NOPs up to 15 bytes in length.
def FeatureFast15ByteNOP
def TuningFast15ByteNOP
: SubtargetFeature<
"fast-15bytenop", "HasFast15ByteNOP", "true",
"Target can quickly decode up to 15 byte NOPs">;
@ -488,21 +488,21 @@ def FeatureFast15ByteNOP
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
def FeatureFastSHLDRotate
def TuningFastSHLDRotate
: SubtargetFeature<
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
def FeatureBranchFusion
def TuningBranchFusion
: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
"CMP/TEST can be fused with conditional branches">;
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
def FeatureMacroFusion
def TuningMacroFusion
: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
"Various instructions can be fused with conditional branches">;
@ -510,50 +510,50 @@ def FeatureMacroFusion
// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
// Skylake Client processor has faster Gathers than HSW and performance is
// similar to Skylake Server (AVX-512).
def FeatureFastGather
def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
def FeaturePrefer128Bit
def TuningPrefer128Bit
: SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
"Prefer 128-bit AVX instructions">;
def FeaturePrefer256Bit
def TuningPrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
def FeaturePreferMaskRegisters
def TuningPreferMaskRegisters
: SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
"Prefer AVX512 mask registers over PTEST/MOVMSK">;
def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
def TuningFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
"Indicates that the BEXTR instruction is implemented as a single uop "
"with good throughput">;
// Combine vector math operations with shuffles into horizontal math
// instructions if a CPU implements horizontal operations (introduced with
// SSE3) with better latency/throughput than the alternative sequence.
def FeatureFastHorizontalOps
def TuningFastHorizontalOps
: SubtargetFeature<
"fast-hops", "HasFastHorizontalOps", "true",
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles">;
def FeatureFastScalarShiftMasks
def TuningFastScalarShiftMasks
: SubtargetFeature<
"fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
"Prefer a left/right scalar logical shift pair over a shift+and pair">;
def FeatureFastVectorShiftMasks
def TuningFastVectorShiftMasks
: SubtargetFeature<
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;
def FeatureFastMOVBE
def TuningFastMOVBE
: SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
"Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
def FeatureUseGLMDivSqrtCosts
def TuningUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
@ -631,8 +631,8 @@ def ProcessorFeatures {
// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningInsertVZEROUPPER];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@ -644,15 +644,15 @@ def ProcessorFeatures {
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
FeatureXSAVE,
FeatureXSAVEOPT];
list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureSlowUAMem32,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> SNBTuning = [TuningMacroFusion,
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowUAMem32,
TuningFastScalarFSQRT,
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@ -673,17 +673,17 @@ def ProcessorFeatures {
FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE];
list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
FeatureFastVariableCrossLaneShuffle,
FeatureFastVariablePerLaneShuffle,
FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> HSWTuning = [TuningMacroFusion,
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningFastScalarFSQRT,
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> HSWFeatures =
!listconcat(IVBFeatures, HSWAdditionalFeatures);
@ -700,18 +700,18 @@ def ProcessorFeatures {
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT];
list<SubtargetFeature> SKLTuning = [FeatureFastGather,
FeatureMacroFusion,
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureFastScalarFSQRT,
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
FeatureFastVariableCrossLaneShuffle,
FeatureFastVariablePerLaneShuffle,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKLTuning = [TuningFastGather,
TuningMacroFusion,
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningFastScalarFSQRT,
TuningFastVectorFSQRT,
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@ -727,19 +727,19 @@ def ProcessorFeatures {
FeatureVLX,
FeaturePKU,
FeatureCLWB];
list<SubtargetFeature> SKXTuning = [FeatureFastGather,
FeatureMacroFusion,
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureFastScalarFSQRT,
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
FeatureFastVariableCrossLaneShuffle,
FeatureFastVariablePerLaneShuffle,
FeaturePrefer256Bit,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKXTuning = [TuningFastGather,
TuningMacroFusion,
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningFastScalarFSQRT,
TuningFastVectorFSQRT,
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@ -765,18 +765,18 @@ def ProcessorFeatures {
FeatureVBMI,
FeatureIFMA,
FeatureSHA];
list<SubtargetFeature> CNLTuning = [FeatureFastGather,
FeatureMacroFusion,
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureFastScalarFSQRT,
FeatureFastVectorFSQRT,
FeatureFastSHLDRotate,
FeatureFast15ByteNOP,
FeatureFastVariableCrossLaneShuffle,
FeatureFastVariablePerLaneShuffle,
FeaturePrefer256Bit,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> CNLTuning = [TuningFastGather,
TuningMacroFusion,
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningFastScalarFSQRT,
TuningFastVectorFSQRT,
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningInsertVZEROUPPER];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@ -846,14 +846,14 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureLAHFSAHF];
list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
FeatureSlowDivide32,
FeatureSlowDivide64,
FeatureSlowTwoMemOps,
FeatureLEAUsesAG,
FeaturePadShortFunctions,
FeatureInsertVZEROUPPER];
TuningSlowUAMem16,
TuningLEAForSP,
TuningSlowDivide32,
TuningSlowDivide64,
TuningSlowTwoMemOps,
TuningLEAUsesAG,
TuningPadShortFunctions,
TuningInsertVZEROUPPER];
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
@ -862,15 +862,15 @@ def ProcessorFeatures {
FeaturePRFCHW,
FeatureRDRAND];
list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureSlowDivide64,
FeatureSlowPMULLD,
FeatureFast7ByteNOP,
FeatureFastMOVBE,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
TuningSlowTwoMemOps,
TuningSlowLEA,
TuningSlowIncDec,
TuningSlowDivide64,
TuningSlowPMULLD,
TuningFast7ByteNOP,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> SLMFeatures =
!listconcat(AtomFeatures, SLMAdditionalFeatures);
@ -884,25 +884,25 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureFastMOVBE,
FeaturePOPCNTFalseDeps,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLMTuning = [TuningUseGLMDivSqrtCosts,
TuningSlowTwoMemOps,
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
list<SubtargetFeature> GLMFeatures =
!listconcat(SLMFeatures, GLMAdditionalFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID];
list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureFastMOVBE,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLPTuning = [TuningUseGLMDivSqrtCosts,
TuningSlowTwoMemOps,
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
TuningInsertVZEROUPPER];
list<SubtargetFeature> GLPFeatures =
!listconcat(GLMFeatures, GLPAdditionalFeatures);
@ -969,14 +969,14 @@ def ProcessorFeatures {
FeatureBMI2,
FeatureFMA,
FeaturePRFCHW];
list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
FeatureSlowTwoMemOps,
FeaturePreferMaskRegisters,
FeatureFastGather,
FeatureFastMOVBE,
FeatureSlowPMADDWD];
list<SubtargetFeature> KNLTuning = [TuningSlowDivide64,
TuningSlow3OpsLEA,
TuningSlowIncDec,
TuningSlowTwoMemOps,
TuningPreferMaskRegisters,
TuningFastGather,
TuningFastMOVBE,
TuningSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
@ -995,9 +995,9 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit];
list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
TuningSlowSHLD,
TuningInsertVZEROUPPER];
// Bobcat
list<SubtargetFeature> BtVer1Features = [FeatureX87,
@ -1014,11 +1014,11 @@ def ProcessorFeatures {
FeatureLZCNT,
FeaturePOPCNT,
FeatureLAHFSAHF];
list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningSlowSHLD,
TuningInsertVZEROUPPER];
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@ -1029,14 +1029,14 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
FeatureFastBEXTR,
FeatureFastHorizontalOps,
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks,
FeatureFastMOVBE,
FeatureSlowSHLD];
list<SubtargetFeature> BtVer2Tuning = [TuningFastLZCNT,
TuningFastBEXTR,
TuningFastHorizontalOps,
TuningFast15ByteNOP,
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningFastMOVBE,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@ -1058,19 +1058,19 @@ def ProcessorFeatures {
FeatureXSAVE,
FeatureLWP,
FeatureLAHFSAHF];
list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
FeatureBranchFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningInsertVZEROUPPER];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
FeatureBMI,
FeatureTBM,
FeatureFMA];
list<SubtargetFeature> BdVer2AdditionalTuning = [FeatureFastBEXTR,
FeatureFastMOVBE];
list<SubtargetFeature> BdVer2AdditionalTuning = [TuningFastBEXTR,
TuningFastMOVBE];
list<SubtargetFeature> BdVer2Tuning =
!listconcat(BdVer1Tuning, BdVer2AdditionalTuning);
list<SubtargetFeature> BdVer2Features =
@ -1127,14 +1127,14 @@ def ProcessorFeatures {
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
FeatureFastBEXTR,
FeatureFast15ByteNOP,
FeatureBranchFusion,
FeatureFastScalarShiftMasks,
FeatureFastMOVBE,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> ZNTuning = [TuningFastLZCNT,
TuningFastBEXTR,
TuningFast15ByteNOP,
TuningBranchFusion,
TuningFastScalarShiftMasks,
TuningFastMOVBE,
TuningSlowSHLD,
TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
@ -1147,8 +1147,8 @@ def ProcessorFeatures {
FeatureVAES,
FeatureVPCLMULQDQ];
list<SubtargetFeature> ZN3AdditionalTuning =
[FeatureMacroFusion,
FeatureFastVariablePerLaneShuffle];
[TuningMacroFusion,
TuningFastVariablePerLaneShuffle];
list<SubtargetFeature> ZN3Tuning =
!listconcat(ZNTuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
@ -1175,37 +1175,37 @@ class ProcModel<string Name, SchedMachineModel Model,
// It has no effect on code generation.
def : ProcModel<"generic", SandyBridgeModel,
[FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
[FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureSlowIncDec,
FeatureMacroFusion,
FeatureInsertVZEROUPPER]>;
[TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
TuningMacroFusion,
TuningInsertVZEROUPPER]>;
def : Proc<"i386", [FeatureX87],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i486", [FeatureX87],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
FeatureNOPL],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
FeatureFXSR, FeatureNOPL],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@ -1221,30 +1221,30 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcModel<P, GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
// Intel Quark.
def : Proc<"lakemont", [FeatureCMPXCHG8B],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// NetBurst.
def : ProcModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : ProcModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureCMPXCHG8B,
@ -1257,8 +1257,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
FeatureCMPXCHG16B,
],
[
FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
TuningSlowUAMem16,
TuningInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
@ -1275,9 +1275,9 @@ def : ProcModel<"core2", SandyBridgeModel, [
FeatureLAHFSAHF
],
[
FeatureMacroFusion,
FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
]>;
def : ProcModel<"penryn", SandyBridgeModel, [
FeatureX87,
@ -1292,9 +1292,9 @@ def : ProcModel<"penryn", SandyBridgeModel, [
FeatureLAHFSAHF
],
[
FeatureMacroFusion,
FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
]>;
// Atom CPUs.
@ -1379,37 +1379,37 @@ def : ProcModel<"alderlake", SkylakeClientModel,
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
FeatureNOPL],
[FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
[FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
[FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
TuningInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
Feature64Bit],
[FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
TuningInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@ -1445,17 +1445,17 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"winchip2", [FeatureX87, Feature3DNow],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"c3", [FeatureX87, Feature3DNow],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureCMOV],
[FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@ -1469,11 +1469,11 @@ def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
// forming a common base for them.
def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
[
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureSlowIncDec,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
TuningMacroFusion,
TuningInsertVZEROUPPER
]>;
// x86-64 micro-architecture levels.

View File

@ -1094,11 +1094,11 @@ static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
if (Subtarget->is64Bit()) {
// FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
// IndexReg/BaseReg below need to be updated.
if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
if (Subtarget->hasFeature(X86::TuningFast7ByteNOP))
MaxNopLength = 7;
else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
else if (Subtarget->hasFeature(X86::TuningFast15ByteNOP))
MaxNopLength = 15;
else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
else if (Subtarget->hasFeature(X86::TuningFast11ByteNOP))
MaxNopLength = 11;
else
MaxNopLength = 10;

View File

@ -45,48 +45,50 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureCMPXCHG16B,
X86::FeatureLAHFSAHF,
// Codegen control options.
X86::FeatureFast11ByteNOP,
X86::FeatureFast15ByteNOP,
X86::FeatureFastBEXTR,
X86::FeatureFastHorizontalOps,
X86::FeatureFastLZCNT,
X86::FeatureFastScalarFSQRT,
X86::FeatureFastSHLDRotate,
X86::FeatureFastScalarShiftMasks,
X86::FeatureFastVectorShiftMasks,
X86::FeatureFastVariableCrossLaneShuffle,
X86::FeatureFastVariablePerLaneShuffle,
X86::FeatureFastVectorFSQRT,
X86::FeatureLEAForSP,
X86::FeatureLEAUsesAG,
X86::FeatureLZCNTFalseDeps,
X86::FeatureBranchFusion,
X86::FeatureMacroFusion,
X86::FeaturePadShortFunctions,
X86::FeaturePOPCNTFalseDeps,
// Some older targets can be setup to fold unaligned loads.
X86::FeatureSSEUnalignedMem,
X86::FeatureSlow3OpsLEA,
X86::FeatureSlowDivide32,
X86::FeatureSlowDivide64,
X86::FeatureSlowIncDec,
X86::FeatureSlowLEA,
X86::FeatureSlowPMADDWD,
X86::FeatureSlowPMULLD,
X86::FeatureSlowSHLD,
X86::FeatureSlowTwoMemOps,
X86::FeatureSlowUAMem16,
X86::FeaturePreferMaskRegisters,
X86::FeatureInsertVZEROUPPER,
X86::FeatureUseGLMDivSqrtCosts,
// Codegen control options.
X86::TuningFast11ByteNOP,
X86::TuningFast15ByteNOP,
X86::TuningFastBEXTR,
X86::TuningFastHorizontalOps,
X86::TuningFastLZCNT,
X86::TuningFastScalarFSQRT,
X86::TuningFastSHLDRotate,
X86::TuningFastScalarShiftMasks,
X86::TuningFastVectorShiftMasks,
X86::TuningFastVariableCrossLaneShuffle,
X86::TuningFastVariablePerLaneShuffle,
X86::TuningFastVectorFSQRT,
X86::TuningLEAForSP,
X86::TuningLEAUsesAG,
X86::TuningLZCNTFalseDeps,
X86::TuningBranchFusion,
X86::TuningMacroFusion,
X86::TuningPadShortFunctions,
X86::TuningPOPCNTFalseDeps,
X86::TuningSlow3OpsLEA,
X86::TuningSlowDivide32,
X86::TuningSlowDivide64,
X86::TuningSlowIncDec,
X86::TuningSlowLEA,
X86::TuningSlowPMADDWD,
X86::TuningSlowPMULLD,
X86::TuningSlowSHLD,
X86::TuningSlowTwoMemOps,
X86::TuningSlowUAMem16,
X86::TuningPreferMaskRegisters,
X86::TuningInsertVZEROUPPER,
X86::TuningUseGLMDivSqrtCosts,
// Perf-tuning flags.
X86::FeatureFastGather,
X86::FeatureSlowUAMem32,
X86::TuningFastGather,
X86::TuningSlowUAMem32,
// Based on whether user set the -mprefer-vector-width command line.
X86::FeaturePrefer128Bit,
X86::FeaturePrefer256Bit,
X86::TuningPrefer128Bit,
X86::TuningPrefer256Bit,
// CPU name enums. These just follow CPU string.
X86::ProcIntelAtom,