[X86] Add support for -mvzeroupper and -mno-vzeroupper to match gcc

-mvzeroupper forces the vzeroupper insertion pass to run on CPUs that
normally wouldn't run it. -mno-vzeroupper disables the pass on CPUs
where it normally runs.

To support this with clang's default feature handling, we need a
vzeroupper feature flag in X86.td. Since this flag has the opposite
polarity of the fast-partial-ymm-or-zmm-write flag previously used to
disable the pass, we need to add the new flag to every CPU except
KNL/KNM and BTVER2 to keep the behavior identical.

Remove the fast-partial-ymm-or-zmm-write feature, which is no longer used.

Differential Revision: https://reviews.llvm.org/D69786
Craig Topper 2019-11-04 10:20:00 -08:00
parent 6ff439b57f
commit b2b6a54f84
9 changed files with 140 additions and 110 deletions
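
As a quick equivalence sketch (not part of the commit; foo.ll stands in for
any AVX-using input), the removed tuning flag and the new feature's negative
form are expected to behave the same, mirroring the updated RUN lines in
avx-vzeroupper.ll below:

    # Old spelling: skip vzeroupper insertion via the removed tuning flag
    llc -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write foo.ll
    # New spelling: the same effect via the feature's negative polarity
    llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-vzeroupper foo.ll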


@ -93,6 +93,10 @@ New Compiler Flags
Clang. Setting the version to zero causes Clang to leave ``__GNUC__`` and
other GNU-namespaced macros, such as ``__GXX_WEAK__``, undefined.
- vzeroupper insertion on X86 targets can now be disabled with -mno-vzeroupper.
  You can also force vzeroupper insertion on CPUs where it normally wouldn't
  run with -mvzeroupper.
Deprecated Compiler Flags
-------------------------


@ -3135,6 +3135,8 @@ def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
def mvzeroupper : Flag<["-"], "mvzeroupper">, Group<m_x86_Features_Group>;
def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group<m_x86_Features_Group>;
// These are legacy user-facing driver-level option spellings. They are always
// aliases for options that are spelled using the more common Unix / GNU flag


@ -193,3 +193,8 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-enqcmd %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-ENQCMD %s
// ENQCMD: "-target-feature" "+enqcmd"
// NO-ENQCMD: "-target-feature" "-enqcmd"
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mvzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=VZEROUPPER %s
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-vzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-VZEROUPPER %s
// VZEROUPPER: "-target-feature" "+vzeroupper"
// NO-VZEROUPPER: "-target-feature" "-vzeroupper"
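
For reference, a usage sketch of the new driver flags (test.c is a
placeholder source file); per the checks above, the driver forwards them as
target features:

    # Emits "-target-feature" "+vzeroupper"
    clang -target i386-unknown-linux-gnu -march=i386 -mvzeroupper -c test.c
    # Emits "-target-feature" "-vzeroupper"
    clang -target i386-unknown-linux-gnu -march=i386 -mno-vzeroupper -c test.c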


@ -134,6 +134,13 @@ Changes to the X86 Target
Intel CPUs. This tries to limit the use of 512-bit registers which can cause a
decrease in CPU frequency on these CPUs. This can be re-enabled by passing
-mprefer-vector-width=512 to clang or passing -mattr=-prefer-256-bit to llc.
* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
intrinsics for this feature. This change only affects the results
returned by getHostCPUFeatures on CPUs that implement the MPX instructions.
* The feature flag fast-partial-ymm-or-zmm-write which previously disabled
vzeroupper insertion has been removed. It has been replaced with a vzeroupper
feature flag which has the opposite polarity. So -vzeroupper has the same
effect as +fast-partial-ymm-or-zmm-write.
Changes to the AMDGPU Target
-----------------------------
@ -143,10 +150,6 @@ Changes to the AVR Target
During this release ...
* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
intrinsics for this feature. This change only affects the results
returned by getHostCPUFeatures on CPUs that implement the MPX instructions.
Changes to the WebAssembly Target
---------------------------------


@ -304,12 +304,12 @@ def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.
def FeatureFastPartialYMMorZMMWrite
: SubtargetFeature<"fast-partial-ymm-or-zmm-write",
"HasFastPartialYMMorZMMWrite",
"true", "Partial writes to YMM/ZMM registers are fast">;
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def FeatureInsertVZEROUPPER
: SubtargetFeature<"vzeroupper",
"InsertVZEROUPPER",
"true", "Should insert vzeroupper instructions">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@ -525,7 +525,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
FeatureMacroFusion];
FeatureMacroFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> NHMSpecificFeatures = [];
list<SubtargetFeature> NHMFeatures =
!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
@ -705,7 +706,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureSlowTwoMemOps,
FeatureLAHFSAHF];
FeatureLAHFSAHF,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
@ -807,7 +809,6 @@ def ProcessorFeatures {
FeaturePRFCHW,
FeaturePreferMaskRegisters,
FeatureSlowTwoMemOps,
FeatureFastPartialYMMorZMMWrite,
FeatureHasFastGather,
FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
@ -828,7 +829,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit,
FeatureFastScalarShiftMasks];
FeatureFastScalarShiftMasks,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
@ -850,7 +852,9 @@ def ProcessorFeatures {
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER];
list<SubtargetFeature> BtVer1Features =
!listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures);
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@ -863,7 +867,6 @@ def ProcessorFeatures {
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
FeatureFastBEXTR,
FeatureFastPartialYMMorZMMWrite,
FeatureFastHorizontalOps];
list<SubtargetFeature> BtVer2InheritableFeatures =
!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
@ -891,7 +894,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
FeatureBranchFusion];
FeatureBranchFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
// PileDriver
@ -954,6 +958,7 @@ def ProcessorFeatures {
FeatureSHA,
FeatureSSE4A,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
@ -976,28 +981,32 @@ class Proc<string Name, list<SubtargetFeature> Features>
// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B]>;
def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B]>;
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B]>;
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureMMX]>;
FeatureCMPXCHG8B, FeatureMMX,
FeatureInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureCMOV, FeatureNOPL]>;
FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureCMOV, FeatureFXSR,
FeatureNOPL]>;
FeatureNOPL, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV,
FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@ -1013,29 +1022,29 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
def : Proc<"lakemont", []>;
def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
@ -1046,7 +1055,8 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B
FeatureCMPXCHG16B,
FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
@ -1062,7 +1072,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
@ -1076,7 +1087,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
@ -1143,35 +1155,36 @@ def : ProcessorModel<"tigerlake", SkylakeServerModel,
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX]>;
FeatureMMX, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNow]>;
Feature3DNow, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNow]>;
Feature3DNow, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
Feature3DNowA, FeatureNOPL, FeatureSlowSHLD,
FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
FeatureSlowSHLD]>;
FeatureSlowSHLD, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
FeatureFastScalarShiftMasks]>;
FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
FeatureFastScalarShiftMasks]>;
FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@ -1196,14 +1209,17 @@ def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNowA]>;
Feature3DNowA, FeatureInsertVZEROUPPER]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureInsertVZEROUPPER]>;
def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
FeatureInsertVZEROUPPER]>;
def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
FeatureInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE1, FeatureFXSR,
FeatureCMOV]>;
FeatureCMOV, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@ -1226,7 +1242,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
FeatureMacroFusion
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
//===----------------------------------------------------------------------===//


@ -256,9 +256,9 @@ protected:
/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle = false;
/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite = false;
/// True if vzeroupper instructions should be inserted after code that uses
/// ymm or zmm registers.
bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
@ -658,9 +658,7 @@ public:
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;
}
bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }


@ -51,7 +51,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureFastBEXTR,
X86::FeatureFastHorizontalOps,
X86::FeatureFastLZCNT,
X86::FeatureFastPartialYMMorZMMWrite,
X86::FeatureFastScalarFSQRT,
X86::FeatureFastSHLDRotate,
X86::FeatureFastScalarShiftMasks,
@ -78,6 +77,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureSlowTwoMemOps,
X86::FeatureSlowUAMem16,
X86::FeaturePreferMaskRegisters,
X86::FeatureInsertVZEROUPPER,
// Perf-tuning flags.
X86::FeatureHasFastGather,


@ -279,7 +279,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
if (!ST.hasAVX() || !ST.insertVZEROUPPER())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
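
Given this gate, one plausible way to exercise both paths from the command
line (a sketch, assuming an AVX-using input foo.ll) is to toggle the feature
against a CPU's default:

    # Force the pass on a CPU where it is now off by default (Jaguar)
    llc -mtriple=x86_64-unknown-unknown -mcpu=btver2 -mattr=+vzeroupper foo.ll
    # Disable it on a CPU that normally runs it
    llc -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -mattr=-vzeroupper foo.ll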


@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,-vzeroupper | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=DISABLE-VZ
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
@ -44,18 +44,18 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; VZ-NEXT: addq $56, %rsp
; VZ-NEXT: retq
;
; FAST-ymm-zmm-LABEL: test01:
; FAST-ymm-zmm: # %bb.0:
; FAST-ymm-zmm-NEXT: subq $56, %rsp
; FAST-ymm-zmm-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-ymm-zmm-NEXT: vmovaps {{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT: callq do_sse
; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT: callq do_sse
; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; FAST-ymm-zmm-NEXT: addq $56, %rsp
; FAST-ymm-zmm-NEXT: retq
; DISABLE-VZ-LABEL: test01:
; DISABLE-VZ: # %bb.0:
; DISABLE-VZ-NEXT: subq $56, %rsp
; DISABLE-VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; DISABLE-VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0
; DISABLE-VZ-NEXT: callq do_sse
; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
; DISABLE-VZ-NEXT: callq do_sse
; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
; DISABLE-VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; DISABLE-VZ-NEXT: addq $56, %rsp
; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test01:
; BDVER2: # %bb.0:
@ -83,6 +83,7 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT: addq $56, %rsp
; BTVER2-NEXT: retq
; DISABLE-VZ # %bb.0:
%tmp = load <4 x float>, <4 x float>* @x, align 16
%call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
store <4 x float> %call, <4 x float>* @x, align 16
@ -100,10 +101,10 @@ define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-NEXT: vzeroupper
; VZ-NEXT: jmp do_sse # TAILCALL
;
; FAST-ymm-zmm-LABEL: test02:
; FAST-ymm-zmm: # %bb.0:
; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT: jmp do_sse # TAILCALL
; DISABLE-VZ-LABEL: test02:
; DISABLE-VZ: # %bb.0:
; DISABLE-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
; DISABLE-VZ-NEXT: jmp do_sse # TAILCALL
;
; BDVER2-LABEL: test02:
; BDVER2: # %bb.0:
@ -154,34 +155,34 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: popq %rbx
; VZ-NEXT: retq
;
; FAST-ymm-zmm-LABEL: test03:
; FAST-ymm-zmm: # %bb.0: # %entry
; FAST-ymm-zmm-NEXT: pushq %rbx
; FAST-ymm-zmm-NEXT: subq $16, %rsp
; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
; FAST-ymm-zmm-NEXT: .LBB3_1: # %while.cond
; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT: callq foo
; FAST-ymm-zmm-NEXT: testl %eax, %eax
; FAST-ymm-zmm-NEXT: jne .LBB3_1
; FAST-ymm-zmm-NEXT: # %bb.2: # %for.body.preheader
; FAST-ymm-zmm-NEXT: movl $4, %ebx
; FAST-ymm-zmm-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
; FAST-ymm-zmm-NEXT: .LBB3_3: # %for.body
; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT: callq do_sse
; FAST-ymm-zmm-NEXT: callq do_sse
; FAST-ymm-zmm-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT: callq do_sse
; FAST-ymm-zmm-NEXT: decl %ebx
; FAST-ymm-zmm-NEXT: jne .LBB3_3
; FAST-ymm-zmm-NEXT: # %bb.4: # %for.end
; FAST-ymm-zmm-NEXT: addq $16, %rsp
; FAST-ymm-zmm-NEXT: popq %rbx
; FAST-ymm-zmm-NEXT: retq
; DISABLE-VZ-LABEL: test03:
; DISABLE-VZ: # %bb.0: # %entry
; DISABLE-VZ-NEXT: pushq %rbx
; DISABLE-VZ-NEXT: subq $16, %rsp
; DISABLE-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
; DISABLE-VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; DISABLE-VZ-NEXT: .p2align 4, 0x90
; DISABLE-VZ-NEXT: .LBB3_1: # %while.cond
; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1
; DISABLE-VZ-NEXT: callq foo
; DISABLE-VZ-NEXT: testl %eax, %eax
; DISABLE-VZ-NEXT: jne .LBB3_1
; DISABLE-VZ-NEXT: # %bb.2: # %for.body.preheader
; DISABLE-VZ-NEXT: movl $4, %ebx
; DISABLE-VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; DISABLE-VZ-NEXT: .p2align 4, 0x90
; DISABLE-VZ-NEXT: .LBB3_3: # %for.body
; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1
; DISABLE-VZ-NEXT: callq do_sse
; DISABLE-VZ-NEXT: callq do_sse
; DISABLE-VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
; DISABLE-VZ-NEXT: callq do_sse
; DISABLE-VZ-NEXT: decl %ebx
; DISABLE-VZ-NEXT: jne .LBB3_3
; DISABLE-VZ-NEXT: # %bb.4: # %for.end
; DISABLE-VZ-NEXT: addq $16, %rsp
; DISABLE-VZ-NEXT: popq %rbx
; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test03:
; BDVER2: # %bb.0: # %entry
@ -279,15 +280,15 @@ define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: vzeroupper
; VZ-NEXT: retq
;
; FAST-ymm-zmm-LABEL: test04:
; FAST-ymm-zmm: # %bb.0:
; FAST-ymm-zmm-NEXT: pushq %rax
; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; FAST-ymm-zmm-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; FAST-ymm-zmm-NEXT: callq do_avx
; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; FAST-ymm-zmm-NEXT: popq %rax
; FAST-ymm-zmm-NEXT: retq
; DISABLE-VZ-LABEL: test04:
; DISABLE-VZ: # %bb.0:
; DISABLE-VZ-NEXT: pushq %rax
; DISABLE-VZ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; DISABLE-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; DISABLE-VZ-NEXT: callq do_avx
; DISABLE-VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; DISABLE-VZ-NEXT: popq %rax
; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test04:
; BDVER2: # %bb.0: