[X86] Remove VPERM2F128/VPERM2I128 intrinsics and autoupgrade to native shuffles.

I've moved the test cases from the InstCombine optimizations to the backend to keep the coverage we had there. It covered every possible immediate so I've preserved the resulting shuffle mask for each of those immediates. llvm-svn: 313450
2017-09-16 07:36:14 +00:00 · 2017-09-16 07:36:14 +00:00 · f264fcc704
parent 6c196978eb
commit f264fcc704
11 changed files with 365 additions and 470 deletions
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@ -965,17 +965,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
        GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;

-  // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
-  def int_x86_avx_vperm2f128_pd_256 :
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                  llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vperm2f128_ps_256 :
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vperm2f128_si_256 :
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                  llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
  def int_x86_avx512_mask_vpermi2var_d_128 :
       GCCBuiltin<"__builtin_ia32_vpermi2vard128_mask">,
        Intrinsic<[llvm_v4i32_ty],
@ -1950,10 +1939,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">,
              Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty],
                        [IntrNoMem]>;
-  // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
-  def int_x86_avx2_vperm2i128 :
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Conditional load ops
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@ -83,6 +83,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
      Name.startswith("avx2.pcmpgt.") || // Added in 3.1
      Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
      Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+      Name.startswith("avx.vperm2f128.") || // Added in 6.0
+      Name == "avx2.vperm2i128" || // Added in 6.0
      Name == "sse.add.ss" || // Added in 4.0
      Name == "sse2.add.sd" || // Added in 4.0
      Name == "sse.sub.ss" || // Added in 4.0
@ -1426,6 +1428,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
      if (CI->getNumArgOperands() == 4)
        Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                            CI->getArgOperand(2));
+    } else if (IsX86 && (Name.startswith("avx.vperm2f128.") ||
+                         Name == "avx2.vperm2i128")) {
+      // The immediate permute control byte looks like this:
+      //    [1:0] - select 128 bits from sources for low half of destination
+      //    [2]   - ignore
+      //    [3]   - zero low half of destination
+      //    [5:4] - select 128 bits from sources for high half of destination
+      //    [6]   - ignore
+      //    [7]   - zero high half of destination
+
+      uint8_t Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+
+      unsigned NumElts = CI->getType()->getVectorNumElements();
+      unsigned HalfSize = NumElts / 2;
+      SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+
+      // Determine which operand(s) are actually in use for this instruction.
+      Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+      Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+
+      // If needed, replace operands based on zero mask.
+      V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0;
+      V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1;
+
+      // Permute low half of result.
+      unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0;
+      for (unsigned i = 0; i < HalfSize; ++i)
+        ShuffleMask[i] = StartIndex + i;
+
+      // Permute high half of result.
+      StartIndex = (Imm & 0x10) ? HalfSize : 0;
+      for (unsigned i = 0; i < HalfSize; ++i)
+        ShuffleMask[i + HalfSize] = NumElts + StartIndex + i;
+
+      Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+
    } else if (IsX86 && (Name.startswith("avx.vpermil.") ||
                         Name == "sse2.pshuf.d" ||
                         Name.startswith("avx512.mask.vpermil.p") ||
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -12174,6 +12174,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
+    // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
+    if (Subtarget.hasAVX2() && V2.isUndef())
+      return SDValue();
+
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@ -365,9 +365,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx_rsqrt_ps_256,  INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
  X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
  X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
  X86_INTRINSIC_DATA(avx_vpermilvar_pd,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
  X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
  X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@ -424,7 +421,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
  X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
  X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
  X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
  X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
  X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@ -1094,72 +1094,6 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
 }

-/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
-/// source vectors, unless a zero bit is set. If a zero bit is set,
-/// then ignore that half of the mask and clear that half of the vector.
-static Value *simplifyX86vperm2(const IntrinsicInst &II,
-                                InstCombiner::BuilderTy &Builder) {
-  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
-  if (!CInt)
-    return nullptr;
-
-  VectorType *VecTy = cast<VectorType>(II.getType());
-  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
-
-  // The immediate permute control byte looks like this:
-  //    [1:0] - select 128 bits from sources for low half of destination
-  //    [2]   - ignore
-  //    [3]   - zero low half of destination
-  //    [5:4] - select 128 bits from sources for high half of destination
-  //    [6]   - ignore
-  //    [7]   - zero high half of destination
-
-  uint8_t Imm = CInt->getZExtValue();
-
-  bool LowHalfZero = Imm & 0x08;
-  bool HighHalfZero = Imm & 0x80;
-
-  // If both zero mask bits are set, this was just a weird way to
-  // generate a zero vector.
-  if (LowHalfZero && HighHalfZero)
-    return ZeroVector;
-
-  // If 0 or 1 zero mask bits are set, this is a simple shuffle.
-  unsigned NumElts = VecTy->getNumElements();
-  unsigned HalfSize = NumElts / 2;
-  SmallVector<uint32_t, 8> ShuffleMask(NumElts);
-
-  // The high bit of the selection field chooses the 1st or 2nd operand.
-  bool LowInputSelect = Imm & 0x02;
-  bool HighInputSelect = Imm & 0x20;
-
-  // The low bit of the selection field chooses the low or high half
-  // of the selected operand.
-  bool LowHalfSelect = Imm & 0x01;
-  bool HighHalfSelect = Imm & 0x10;
-
-  // Determine which operand(s) are actually in use for this instruction.
-  Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
-  Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
-
-  // If needed, replace operands based on zero mask.
-  V0 = LowHalfZero ? ZeroVector : V0;
-  V1 = HighHalfZero ? ZeroVector : V1;
-
-  // Permute low half of result.
-  unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
-  for (unsigned i = 0; i < HalfSize; ++i)
-    ShuffleMask[i] = StartIndex + i;
-
-  // Permute high half of result.
-  StartIndex = HighHalfSelect ? HalfSize : 0;
-  StartIndex += NumElts;
-  for (unsigned i = 0; i < HalfSize; ++i)
-    ShuffleMask[i + HalfSize] = StartIndex + i;
-
-  return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
-}
-
 /// Decode XOP integer vector comparison intrinsics.
 static Value *simplifyX86vpcom(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder,
@ -3018,14 +2952,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    }
    break;

-  case Intrinsic::x86_avx_vperm2f128_pd_256:
-  case Intrinsic::x86_avx_vperm2f128_ps_256:
-  case Intrinsic::x86_avx_vperm2f128_si_256:
-  case Intrinsic::x86_avx2_vperm2i128:
-    if (Value *V = simplifyX86vperm2(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@ -762,3 +762,66 @@ define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
  ret <8 x float> %res
 }
 declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_si_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_si_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@ -612,42 +612,6 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone


-define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-
-
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
 ; AVX-LABEL: test_x86_avx_vpermilvar_pd:
 ; AVX:       # BB#0:
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512

 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: test_x86_avx2_pblendw:
@ -565,3 +565,18 @@ define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
 }
 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

+
+define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX2-LABEL: test_x86_avx2_vperm2i128:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    retl
+;
+; AVX512-LABEL: test_x86_avx2_vperm2i128:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT:    retl
+  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@ -1002,18 +1002,6 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly


-define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vperm2i128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]
-; CHECK-NEXT:    ## ymm0 = ymm0[2,3,0,1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-
 define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx2_maskload_q:
 ; CHECK:       ## BB#0:
@ -1259,18 +1247,18 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 ; AVX2:       ## BB#0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX2-NEXT:    vpsravd LCPI85_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX2-NEXT:    vpsravd LCPI84_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovdqa LCPI85_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT:    vmovdqa LCPI84_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpsravd LCPI85_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vpsravd LCPI84_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
  ret <4 x i32> %res
@ -1296,18 +1284,18 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
 ; AVX2:       ## BB#0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; AVX2-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX2-NEXT:    vpsravd LCPI87_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX2-NEXT:    vpsravd LCPI86_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovdqa LCPI87_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT:    vmovdqa LCPI86_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpsravd LCPI87_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vpsravd LCPI86_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
  ret <8 x i32> %res
--- a/llvm/test/CodeGen/X86/x86-vperm2.ll
+++ b/llvm/test/CodeGen/X86/x86-vperm2.ll
@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+; Test cases derived from the possible immediate values of the vperm2f128 intrinsics.
+
+define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x00:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x00]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x01:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $1, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x01]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x02:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x18,0xc0,0x01]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x03:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x11:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x12:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vblendpd $12, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x0d,0xc0,0x0c]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1],ymm0[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],ymm0[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x20:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x21:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x21]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x22:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $0, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x00]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x23:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $1, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x01]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x30:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x31:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## encoding: [0xc5,0xfc,0x28,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x33:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: perm2ps_0x31:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %1
+}
+
+define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x33:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x i64> %a1, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i64> %1
+}
+
+define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x81:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $129, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x81]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],zero,zero
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x83:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $129, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x81]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],zero,zero
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x28:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x08:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x28:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %1
+}
--- a/llvm/test/Transforms/InstCombine/X86/x86-vperm2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm2.ll
@ -1,313 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
-
-define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
-; CHECK-LABEL: @perm2pd_non_const_imm(
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
-; CHECK-NEXT:    ret <4 x double> [[RES]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
-  ret <4 x double> %res
-
-}
-
-
-; In the following 4 tests, both zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x88(
-; CHECK-NEXT:    ret <4 x double> zeroinitializer
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
-  ret <4 x double> %res
-
-}
-
-define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x88(
-; CHECK-NEXT:    ret <8 x float> zeroinitializer
-;
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
-  ret <8 x float> %res
-
-}
-
-define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @perm2si_0x88(
-; CHECK-NEXT:    ret <8 x i32> zeroinitializer
-;
-  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
-  ret <8 x i32> %res
-
-}
-
-define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x88(
-; CHECK-NEXT:    ret <4 x i64> zeroinitializer
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136)
-  ret <4 x i64> %res
-
-}
-
-
-; The other control bits are ignored when zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0xff(
-; CHECK-NEXT:    ret <4 x double> zeroinitializer
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
-  ret <4 x double> %res
-
-}
-
-
-; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
-; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible..
-
-define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x00(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x01(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x02(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x03(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x10(
-; CHECK-NEXT:    ret <4 x double> %a0
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x11(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x12(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x13(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x20(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x21(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x22(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x23(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x30(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x31(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x32(
-; CHECK-NEXT:    ret <4 x double> %a1
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x33(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
-  ret <4 x double> %res
-
-}
-
-; Confirm that a mask for 32-bit elements is also correct.
-
-define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x31(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x float> [[TMP1]]
-;
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
-  ret <8 x float> %res
-
-}
-
-
-; Confirm that the AVX2 version works the same.
-
-define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x33(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
-  ret <4 x i64> %res
-
-}
-
-
-; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.
-
-define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x81(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x83(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x28(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x08(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
-  ret <4 x double> %res
-
-}
-
-; Check one more with the AVX2 version.
-
-define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x28(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
-  ret <4 x i64> %res
-
-}
-
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone
-