[X86] Emit 11-byte or 15-byte NOPs on recent AMD targets, else default to 10-byte NOPs (PR22965)

We currently emit up to 15-byte NOPs on all targets (apart from Silvermont), which stalls performance on some targets with decoders that struggle with 2 or 3 or more '66' prefixes. This patch flags recent AMD targets (btver1/btver2/znver1) to still emit 15-byte NOPs and bdver* targets to emit 11-byte NOPs. All other targets now emit 10-byte NOPs, apart from Silvermont CPUs, which still emit 7-byte NOPs. Differential Revision: https://reviews.llvm.org/D42616 llvm-svn: 323693
2018-01-29 21:24:31 +00:00 · 2018-01-29 21:24:31 +00:00 · 02bdac53e7
parent 08464524c3
commit 02bdac53e7
10 changed files with 90 additions and 33 deletions
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@ -344,10 +344,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
    return true;
  }

-  uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15;
+  // 15 bytes is the longest single NOP instruction, but 10 bytes is
+  // commonly the longest that can be efficiently decoded.
+  uint64_t MaxNopLength = 10;
+  if (STI.getFeatureBits()[X86::ProcIntelSLM])
+    MaxNopLength = 7;
+  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+    MaxNopLength = 15;
+  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+    MaxNopLength = 11;

-  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
-  // needed, then emit a nop of the remaining length.
+  // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+  // length.
  do {
    const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@ -305,8 +305,16 @@ def FeatureFastLZCNT
    : SubtargetFeature<
          "fast-lzcnt", "HasFastLZCNT", "true",
          "LZCNT instructions are as fast as most simple integer ops">;
-
-
+// Set if the target can efficiently decode NOPs up to 11 bytes in length.
+def FeatureFast11ByteNOP
+    : SubtargetFeature<
+          "fast-11bytenop", "HasFast11ByteNOP", "true",
+          "Target can quickly decode up to 11 byte NOPs">;
+// Set if the target can efficiently decode NOPs up to 15 bytes in length.
+def FeatureFast15ByteNOP
+    : SubtargetFeature<
+          "fast-15bytenop", "HasFast15ByteNOP", "true",
+          "Target can quickly decode up to 15 byte NOPs">;
 // Sandy Bridge and newer processors can use SHLD with the same source on both
 // inputs to implement rotate to avoid the partial flag update of the normal
 // rotate instructions.
@ -849,7 +857,8 @@ def : Proc<"btver1", [
  FeatureLZCNT,
  FeaturePOPCNT,
  FeatureSlowSHLD,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureFast15ByteNOP
 ]>;

 // Jaguar
@ -874,6 +883,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
  FeatureXSAVEOPT,
  FeatureSlowSHLD,
  FeatureLAHFSAHF,
+  FeatureFast15ByteNOP,
  FeatureFastPartialYMMorZMMWrite
 ]>;

@ -897,6 +907,7 @@ def : Proc<"bdver1", [
  FeatureLWP,
  FeatureSlowSHLD,
  FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
  FeatureMacroFusion
 ]>;
 // Piledriver
@ -923,6 +934,7 @@ def : Proc<"bdver2", [
  FeatureFMA,
  FeatureSlowSHLD,
  FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
  FeatureMacroFusion
 ]>;

@ -952,6 +964,7 @@ def : Proc<"bdver3", [
  FeatureSlowSHLD,
  FeatureFSGSBase,
  FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
  FeatureMacroFusion
 ]>;

@ -981,6 +994,7 @@ def : Proc<"bdver4", [
  FeatureSlowSHLD,
  FeatureFSGSBase,
  FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
  FeatureMWAITX,
  FeatureMacroFusion
 ]>;
@ -1003,6 +1017,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
  FeatureFastLZCNT,
  FeatureLAHFSAHF,
  FeatureLZCNT,
+  FeatureFast15ByteNOP,
  FeatureMacroFusion,
  FeatureMMX,
  FeatureMOVBE,
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@ -335,6 +335,8 @@ void X86Subtarget::initializeEnvironment() {
  HasLZCNTFalseDeps = false;
  HasFastVariableShuffle = false;
  HasFastPartialYMMorZMMWrite = false;
+  HasFast11ByteNOP = false;
+  HasFast15ByteNOP = false;
  HasFastGather = false;
  HasFastScalarFSQRT = false;
  HasFastVectorFSQRT = false;
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@ -246,6 +246,14 @@ protected:
  /// of a YMM or ZMM register without clearing the upper part.
  bool HasFastPartialYMMorZMMWrite;

+  /// True if there is no performance penalty for decoding NOPs of up to
+  /// 11 bytes.
+  bool HasFast11ByteNOP;
+
+  /// True if there is no performance penalty for decoding NOPs of up to
+  /// 15 bytes.
+  bool HasFast15ByteNOP;
+
  /// True if gather is reasonably fast. This is true for Skylake client and
  /// all AVX-512 CPUs.
  bool HasFastGather;
--- a/llvm/test/MC/MachO/x86_32-optimal_nop.s
+++ b/llvm/test/MC/MachO/x86_32-optimal_nop.s
@ -202,15 +202,15 @@ f0:
 // CHECK:       0090: C3000000 00000000 00000000 00000000  |................|
 // CHECK:       00A0: C3C3C3C3 C3C3C366 0F1F8400 00000000  |.......f........|
 // CHECK:       00B0: C3000000 00000000 00000000 00000000  |................|
-// CHECK:       00C0: C3C3C3C3 C366662E 0F1F8400 00000000  |.....ff.........|
+// CHECK:       00C0: C3C3C3C3 C3662E0F 1F840000 00000090  |.....f..........|
 // CHECK:       00D0: C3000000 00000000 00000000 00000000  |................|
-// CHECK:       00E0: C3C3C3C3 6666662E 0F1F8400 00000000  |....fff.........|
+// CHECK:       00E0: C3C3C3C3 662E0F1F 84000000 00006690  |....f.........f.|
 // CHECK:       00F0: C3000000 00000000 00000000 00000000  |................|
-// CHECK:       0100: C3C3C366 6666662E 0F1F8400 00000000  |...ffff.........|
+// CHECK:       0100: C3C3C366 2E0F1F84 00000000 000F1F00  |...f............|
 // CHECK:       0110: C3000000 00000000 00000000 00000000  |................|
-// CHECK:       0120: C3C36666 6666662E 0F1F8400 00000000  |..fffff.........|
+// CHECK:       0120: C3C3662E 0F1F8400 00000000 0F1F4000  |..f...........@.|
 // CHECK:       0130: C3000000 00000000 00000000 00000000  |................|
-// CHECK:       0140: C3666666 6666662E 0F1F8400 00000000  |.ffffff.........|
+// CHECK:       0140: C3662E0F 1F840000 0000000F 1F440000  |.f...........D..|
 // CHECK:       0150: C3                                   |.|
 // CHECK:     )
 // CHECK:   }
@ -255,7 +255,7 @@ f0:
 // CHECK: }
 // CHECK: Segment {
 // CHECK:   Cmd: LC_SEGMENT
-// CHECK:   Name: 
+// CHECK:   Name:
 // CHECK:   Size: 192
 // CHECK:   vmaddr: 0x0
 // CHECK:   vmsize: 0x174
--- a/llvm/test/MC/X86/AlignedBundling/long-nop-pad.s
+++ b/llvm/test/MC/X86/AlignedBundling/long-nop-pad.s
@ -13,17 +13,19 @@ foo:
  .bundle_lock align_to_end
  callq   bar
  .bundle_unlock
-# To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP.
+# To align this group to a bundle end, we need two 10-byte NOPs and a 7-byte NOP.
 # CHECK:        0:  nop
-# CHECK-NEXT:   f:  nop
+# CHECK-NEXT:   a:  nop
+# CHECK-NEXT:   14: nop
 # CHECK:   1b: callq

 # This push instruction is 1 byte long
  .bundle_lock align_to_end
  push %rax
  .bundle_unlock
-# To align this group to a bundle end, we need two 15-byte NOPs, and a 1-byte.
+# To align this group to a bundle end, we need three 10-byte NOPs and a 1-byte NOP.
 # CHECK:        20:  nop
-# CHECK-NEXT:   2f:  nop
+# CHECK-NEXT:   2a:  nop
+# CHECK-NEXT:   34:  nop
 # CHECK-NEXT:   3e:  nop
 # CHECK-NEXT:   3f: pushq
--- a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
+++ b/llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
@ -13,9 +13,9 @@ foo:
        .bundle_lock align_to_end
 # CHECK:            1:  nopw %cs:(%eax,%eax)
 # CHECK:            10: nopw %cs:(%eax,%eax)
-# CHECK-RELAX:      1f: nop
+# CHECK-RELAX:      1a: nop
 # CHECK-RELAX:      20: nopw %cs:(%eax,%eax)
-# CHECK-RELAX:      2f: nopw %cs:(%eax,%eax)
+# CHECK-RELAX:      2a: nopw %cs:(%eax,%eax)
 # CHECK-OPT:        1b: calll -4
 # CHECK-RELAX:      3b: calll -4
        calll   bar # 5 bytes
--- a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s
+++ b/llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s
@ -12,7 +12,7 @@ foo:
        .align  16
 # CHECK:            1:  nopw %cs:(%eax,%eax)
 # CHECK-RELAX:      10: nopw %cs:(%eax,%eax)
-# CHECK-RELAX:      1f: nop
+# CHECK-RELAX:      1a: nop
 # CHECK-OPT:        10: movl $1, (%esp)
 # CHECK-RELAX:      20: movl $1, (%esp)
        movl $0x1, (%esp)     # 7 bytes
--- a/llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s
+++ b/llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s
@ -38,9 +38,10 @@ foo:
  callq   bar
  callq   bar
  .bundle_unlock
-# And here we'll need a 11-byte NOP
+# And here we'll need a 10-byte NOP + 1-byte NOP
 # CHECK:        30: callq
 # CHECK:        35: nop
+# CHECK:        3f: nop
 # CHECK-NEXT:   40: callq
 # CHECK-NEXT:   45: callq

--- a/llvm/test/MC/X86/x86_long_nop.s
+++ b/llvm/test/MC/X86/x86_long_nop.s
@ -1,30 +1,51 @@
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP10
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=bdver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP11
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=bdver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP11
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=btver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=btver2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15

-# Ensure alignment directives also emit sequences of 15-byte NOPs on processors
+# Ensure alignment directives also emit sequences of 10, 11 and 15-byte NOPs on processors
 # capable of using long NOPs.
 inc %eax
 .p2align 5
 inc %eax
-# CHECK: 0:  inc
-# CHECK-NEXT: 1:  nop
-# CHECK-NEXT: 10:  nop
-# CHECK-NEXT: 1f:  nop
-# CHECK-NEXT: 20:  inc
+# LNOP15: 0:  inc
+# LNOP15-NEXT: 1:  nop
+# LNOP15-NEXT: 10: nop
+# LNOP15-NEXT: 1f: nop
+# LNOP15-NEXT: 20: inc
+
+# LNOP11: 0:  inc
+# LNOP11-NEXT: 1:  nop
+# LNOP11-NEXT: c:  nop
+# LNOP11-NEXT: 17: nop
+# LNOP11-NEXT: 20: inc
+
+# LNOP10: 0:  inc
+# LNOP10-NEXT: 1:  nop
+# LNOP10-NEXT: b:  nop
+# LNOP10-NEXT: 15: nop
+# LNOP10-NEXT: 1f: nop
+# LNOP10-NEXT: 20: inc

 # On Silvermont we emit only 7 byte NOPs since longer NOPs are not profitable.
 # LNOP7: 0:  inc
 # LNOP7-NEXT: 1:  nop
 # LNOP7-NEXT: 8:  nop
 # LNOP7-NEXT: f:  nop
-# LNOP7-NEXT: 16:  nop
-# LNOP7-NEXT: 1d:  nop
-# LNOP7-NEXT: 20:  inc
+# LNOP7-NEXT: 16: nop
+# LNOP7-NEXT: 1d: nop
+# LNOP7-NEXT: 20: inc

 # On Lakemont we emit only 1 byte NOPs since longer NOPs are not supported/legal
 # NOP1: 0:  inc