forked from OSchip/llvm-project
[X86] Emit 11-byte or 15-byte NOPs on recent AMD targets, else default to 10-byte NOPs (PR22965)
We currently emit up to 15-byte NOPs on all targets (apart from Silvermont), which stalls performance on some targets with decoders that struggle with 2 or 3 more '66' prefixes. This patch flags recent AMD targets (btver1/btver2/znver1) to still emit 15-byte NOPs and bdver* targets to emit 11-byte NOPs. All other targets now emit 10-byte NOPs, apart from Silvermont CPUs, which still emit 7-byte NOPs. Differential Revision: https://reviews.llvm.org/D42616 llvm-svn: 323693
This commit is contained in:
parent
08464524c3
commit
02bdac53e7
|
@ -344,10 +344,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
|
|||
return true;
|
||||
}
|
||||
|
||||
uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15;
|
||||
// 15 bytes is the longest single NOP instruction, but 10 bytes is
|
||||
// commonly the longest that can be efficiently decoded.
|
||||
uint64_t MaxNopLength = 10;
|
||||
if (STI.getFeatureBits()[X86::ProcIntelSLM])
|
||||
MaxNopLength = 7;
|
||||
else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
|
||||
MaxNopLength = 15;
|
||||
else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
|
||||
MaxNopLength = 11;
|
||||
|
||||
// 15 is the longest single nop instruction. Emit as many 15-byte nops as
|
||||
// needed, then emit a nop of the remaining length.
|
||||
// Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
|
||||
// length.
|
||||
do {
|
||||
const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
|
||||
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
|
||||
|
|
|
@ -305,8 +305,16 @@ def FeatureFastLZCNT
|
|||
: SubtargetFeature<
|
||||
"fast-lzcnt", "HasFastLZCNT", "true",
|
||||
"LZCNT instructions are as fast as most simple integer ops">;
|
||||
|
||||
|
||||
// If the target can efficiently decode NOPs up to 11 bytes in length.
|
||||
def FeatureFast11ByteNOP
|
||||
: SubtargetFeature<
|
||||
"fast-11bytenop", "HasFast11ByteNOP", "true",
|
||||
"Target can quickly decode up to 11 byte NOPs">;
|
||||
// If the target can efficiently decode NOPs up to 15 bytes in length.
|
||||
def FeatureFast15ByteNOP
|
||||
: SubtargetFeature<
|
||||
"fast-15bytenop", "HasFast15ByteNOP", "true",
|
||||
"Target can quickly decode up to 15 byte NOPs">;
|
||||
// Sandy Bridge and newer processors can use SHLD with the same source on both
|
||||
// inputs to implement rotate to avoid the partial flag update of the normal
|
||||
// rotate instructions.
|
||||
|
@ -849,7 +857,8 @@ def : Proc<"btver1", [
|
|||
FeatureLZCNT,
|
||||
FeaturePOPCNT,
|
||||
FeatureSlowSHLD,
|
||||
FeatureLAHFSAHF
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast15ByteNOP
|
||||
]>;
|
||||
|
||||
// Jaguar
|
||||
|
@ -874,6 +883,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
|
|||
FeatureXSAVEOPT,
|
||||
FeatureSlowSHLD,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast15ByteNOP,
|
||||
FeatureFastPartialYMMorZMMWrite
|
||||
]>;
|
||||
|
||||
|
@ -897,6 +907,7 @@ def : Proc<"bdver1", [
|
|||
FeatureLWP,
|
||||
FeatureSlowSHLD,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast11ByteNOP,
|
||||
FeatureMacroFusion
|
||||
]>;
|
||||
// Piledriver
|
||||
|
@ -923,6 +934,7 @@ def : Proc<"bdver2", [
|
|||
FeatureFMA,
|
||||
FeatureSlowSHLD,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast11ByteNOP,
|
||||
FeatureMacroFusion
|
||||
]>;
|
||||
|
||||
|
@ -952,6 +964,7 @@ def : Proc<"bdver3", [
|
|||
FeatureSlowSHLD,
|
||||
FeatureFSGSBase,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast11ByteNOP,
|
||||
FeatureMacroFusion
|
||||
]>;
|
||||
|
||||
|
@ -981,6 +994,7 @@ def : Proc<"bdver4", [
|
|||
FeatureSlowSHLD,
|
||||
FeatureFSGSBase,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast11ByteNOP,
|
||||
FeatureMWAITX,
|
||||
FeatureMacroFusion
|
||||
]>;
|
||||
|
@ -1003,6 +1017,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
|
|||
FeatureFastLZCNT,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureLZCNT,
|
||||
FeatureFast15ByteNOP,
|
||||
FeatureMacroFusion,
|
||||
FeatureMMX,
|
||||
FeatureMOVBE,
|
||||
|
|
|
@ -335,6 +335,8 @@ void X86Subtarget::initializeEnvironment() {
|
|||
HasLZCNTFalseDeps = false;
|
||||
HasFastVariableShuffle = false;
|
||||
HasFastPartialYMMorZMMWrite = false;
|
||||
HasFast11ByteNOP = false;
|
||||
HasFast15ByteNOP = false;
|
||||
HasFastGather = false;
|
||||
HasFastScalarFSQRT = false;
|
||||
HasFastVectorFSQRT = false;
|
||||
|
|
|
@ -246,6 +246,14 @@ protected:
|
|||
/// of a YMM or ZMM register without clearing the upper part.
|
||||
bool HasFastPartialYMMorZMMWrite;
|
||||
|
||||
/// True if there is no performance penalty for writing NOPs with up to
|
||||
/// 11 bytes.
|
||||
bool HasFast11ByteNOP;
|
||||
|
||||
/// True if there is no performance penalty for writing NOPs with up to
|
||||
/// 15 bytes.
|
||||
bool HasFast15ByteNOP;
|
||||
|
||||
/// True if gather is reasonably fast. This is true for Skylake client and
|
||||
/// all AVX-512 CPUs.
|
||||
bool HasFastGather;
|
||||
|
|
|
@ -202,15 +202,15 @@ f0:
|
|||
// CHECK: 0090: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 00A0: C3C3C3C3 C3C3C366 0F1F8400 00000000 |.......f........|
|
||||
// CHECK: 00B0: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 00C0: C3C3C3C3 C366662E 0F1F8400 00000000 |.....ff.........|
|
||||
// CHECK: 00C0: C3C3C3C3 C3662E0F 1F840000 00000090 |.....f..........|
|
||||
// CHECK: 00D0: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 00E0: C3C3C3C3 6666662E 0F1F8400 00000000 |....fff.........|
|
||||
// CHECK: 00E0: C3C3C3C3 662E0F1F 84000000 00006690 |....f.........f.|
|
||||
// CHECK: 00F0: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 0100: C3C3C366 6666662E 0F1F8400 00000000 |...ffff.........|
|
||||
// CHECK: 0100: C3C3C366 2E0F1F84 00000000 000F1F00 |...f............|
|
||||
// CHECK: 0110: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 0120: C3C36666 6666662E 0F1F8400 00000000 |..fffff.........|
|
||||
// CHECK: 0120: C3C3662E 0F1F8400 00000000 0F1F4000 |..f...........@.|
|
||||
// CHECK: 0130: C3000000 00000000 00000000 00000000 |................|
|
||||
// CHECK: 0140: C3666666 6666662E 0F1F8400 00000000 |.ffffff.........|
|
||||
// CHECK: 0140: C3662E0F 1F840000 0000000F 1F440000 |.f...........D..|
|
||||
// CHECK: 0150: C3 |.|
|
||||
// CHECK: )
|
||||
// CHECK: }
|
||||
|
@ -255,7 +255,7 @@ f0:
|
|||
// CHECK: }
|
||||
// CHECK: Segment {
|
||||
// CHECK: Cmd: LC_SEGMENT
|
||||
// CHECK: Name:
|
||||
// CHECK: Name:
|
||||
// CHECK: Size: 192
|
||||
// CHECK: vmaddr: 0x0
|
||||
// CHECK: vmsize: 0x174
|
||||
|
|
|
@ -13,17 +13,19 @@ foo:
|
|||
.bundle_lock align_to_end
|
||||
callq bar
|
||||
.bundle_unlock
|
||||
# To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP.
|
||||
# To align this group to a bundle end, we need two 10-byte NOPs and a 7-byte NOP.
|
||||
# CHECK: 0: nop
|
||||
# CHECK-NEXT: f: nop
|
||||
# CHECK-NEXT: a: nop
|
||||
# CHECK-NEXT: 14: nop
|
||||
# CHECK: 1b: callq
|
||||
|
||||
# This push instruction is 1 byte long
|
||||
.bundle_lock align_to_end
|
||||
push %rax
|
||||
.bundle_unlock
|
||||
# To align this group to a bundle end, we need two 15-byte NOPs, and a 1-byte.
|
||||
# To align this group to a bundle end, we need three 10-byte NOPs, and a 1-byte.
|
||||
# CHECK: 20: nop
|
||||
# CHECK-NEXT: 2f: nop
|
||||
# CHECK-NEXT: 2a: nop
|
||||
# CHECK-NEXT: 34: nop
|
||||
# CHECK-NEXT: 3e: nop
|
||||
# CHECK-NEXT: 3f: pushq
|
||||
|
|
|
@ -13,9 +13,9 @@ foo:
|
|||
.bundle_lock align_to_end
|
||||
# CHECK: 1: nopw %cs:(%eax,%eax)
|
||||
# CHECK: 10: nopw %cs:(%eax,%eax)
|
||||
# CHECK-RELAX: 1f: nop
|
||||
# CHECK-RELAX: 1a: nop
|
||||
# CHECK-RELAX: 20: nopw %cs:(%eax,%eax)
|
||||
# CHECK-RELAX: 2f: nopw %cs:(%eax,%eax)
|
||||
# CHECK-RELAX: 2a: nopw %cs:(%eax,%eax)
|
||||
# CHECK-OPT: 1b: calll -4
|
||||
# CHECK-RELAX: 3b: calll -4
|
||||
calll bar # 5 bytes
|
||||
|
|
|
@ -12,7 +12,7 @@ foo:
|
|||
.align 16
|
||||
# CHECK: 1: nopw %cs:(%eax,%eax)
|
||||
# CHECK-RELAX: 10: nopw %cs:(%eax,%eax)
|
||||
# CHECK-RELAX: 1f: nop
|
||||
# CHECK-RELAX: 1a: nop
|
||||
# CHECK-OPT: 10: movl $1, (%esp)
|
||||
# CHECK-RELAX: 20: movl $1, (%esp)
|
||||
movl $0x1, (%esp) # 7 bytes
|
||||
|
|
|
@ -38,9 +38,10 @@ foo:
|
|||
callq bar
|
||||
callq bar
|
||||
.bundle_unlock
|
||||
# And here we'll need a 11-byte NOP
|
||||
# And here we'll need a 10-byte NOP + 1-byte NOP
|
||||
# CHECK: 30: callq
|
||||
# CHECK: 35: nop
|
||||
# CHECK: 3f: nop
|
||||
# CHECK-NEXT: 40: callq
|
||||
# CHECK-NEXT: 45: callq
|
||||
|
||||
|
|
|
@ -1,30 +1,51 @@
|
|||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=bdver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP11
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=bdver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP11
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=btver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=btver2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
|
||||
|
||||
# Ensure alignment directives also emit sequences of 15-byte NOPs on processors
|
||||
# Ensure alignment directives also emit sequences of 10, 11 and 15-byte NOPs on processors
|
||||
# capable of using long NOPs.
|
||||
inc %eax
|
||||
.p2align 5
|
||||
inc %eax
|
||||
# CHECK: 0: inc
|
||||
# CHECK-NEXT: 1: nop
|
||||
# CHECK-NEXT: 10: nop
|
||||
# CHECK-NEXT: 1f: nop
|
||||
# CHECK-NEXT: 20: inc
|
||||
# LNOP15: 0: inc
|
||||
# LNOP15-NEXT: 1: nop
|
||||
# LNOP15-NEXT: 10: nop
|
||||
# LNOP15-NEXT: 1f: nop
|
||||
# LNOP15-NEXT: 20: inc
|
||||
|
||||
# LNOP11: 0: inc
|
||||
# LNOP11-NEXT: 1: nop
|
||||
# LNOP11-NEXT: c: nop
|
||||
# LNOP11-NEXT: 17: nop
|
||||
# LNOP11-NEXT: 20: inc
|
||||
|
||||
# LNOP10: 0: inc
|
||||
# LNOP10-NEXT: 1: nop
|
||||
# LNOP10-NEXT: b: nop
|
||||
# LNOP10-NEXT: 15: nop
|
||||
# LNOP10-NEXT: 1f: nop
|
||||
# LNOP10-NEXT: 20: inc
|
||||
|
||||
# On Silvermont we emit only 7 byte NOPs since longer NOPs are not profitable.
|
||||
# LNOP7: 0: inc
|
||||
# LNOP7-NEXT: 1: nop
|
||||
# LNOP7-NEXT: 8: nop
|
||||
# LNOP7-NEXT: f: nop
|
||||
# LNOP7-NEXT: 16: nop
|
||||
# LNOP7-NEXT: 1d: nop
|
||||
# LNOP7-NEXT: 20: inc
|
||||
# LNOP7-NEXT: 16: nop
|
||||
# LNOP7-NEXT: 1d: nop
|
||||
# LNOP7-NEXT: 20: inc
|
||||
|
||||
# On Lakemont we emit only 1 byte NOPs since longer NOPs are not supported/legal
|
||||
# NOP1: 0: inc
|
||||
|
|
Loading…
Reference in New Issue