[X86][SSE] Allow target shuffle combining to match masks with SM_Sentinel values

We currently only allow exact matches of shuffle mask patterns during target shuffle combining.

This patch relaxes this so that SM_SentinelUndef elements in the combined shuffle mask are always accepted, while SM_SentinelZero elements are only accepted where the expected mask also contains SM_SentinelZero.

I've adjusted some tests that previously relied on exact shuffle masks so that they now include undef values.

Differential Revision: http://reviews.llvm.org/D21495

llvm-svn: 273119
Simon Pilgrim 2016-06-19 18:03:52 +00:00
parent f8ee0e0218
commit 3d881a0230
6 changed files with 66 additions and 40 deletions
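To make the relaxed matching rule concrete, here is a small standalone sketch (illustration only, not part of the patch): an undef element in the combined mask matches anything, a zero sentinel must also be a zero sentinel in the expected mask, and every other element must equal the expected index. The names isEquivalent, kSentinelUndef and kSentinelZero are placeholders that mirror LLVM's SM_SentinelUndef (-1) and SM_SentinelZero (-2).

#include <cassert>
#include <cstdio>
#include <vector>

// Placeholder sentinels mirroring LLVM's SM_SentinelUndef / SM_SentinelZero.
static const int kSentinelUndef = -1;
static const int kSentinelZero = -2;

// Relaxed mask comparison: undef always matches, a zero sentinel must match a
// zero sentinel, any other element must equal the expected index.
static bool isEquivalent(const std::vector<int> &Mask,
                         const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] == kSentinelUndef)
      continue; // undef in the combined mask matches anything
    if (Mask[i] < 0 && Mask[i] != kSentinelZero)
      return false; // unrecognised negative index
    if (Mask[i] != Expected[i])
      return false; // zero sentinels and real indices must match exactly
  }
  return true;
}

int main() {
  // {undef,1,3,3} now matches the MOVSHDUP pattern {1,1,3,3}.
  assert(isEquivalent({kSentinelUndef, 1, 3, 3}, {1, 1, 3, 3}));
  // A differing real index still fails.
  assert(!isEquivalent({0, 1, 3, 3}, {1, 1, 3, 3}));
  // Zero sentinels must match exactly, as in the VZEXT_MOVL (MOVQ) pattern.
  assert(isEquivalent({0, kSentinelZero}, {0, kSentinelZero}));
  assert(!isEquivalent({0, kSentinelZero}, {0, 1}));
  std::puts("relaxed mask matching behaves as described");
  return 0;
}

With this rule, a combined mask such as {undef,1,3,3} can now be matched to the MOVSHDUP pattern {1,1,3,3}, which is exactly what the adjusted tests below exercise.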


@@ -7123,6 +7123,31 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
return true;
}
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in both.
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+                                      ArrayRef<int> ExpectedMask) {
+  int Size = Mask.size();
+  if (Size != ExpectedMask.size())
+    return false;
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+      return false;
+    else if (Mask[i] != ExpectedMask[i])
+      return false;
+  return true;
+}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -24541,15 +24566,14 @@ static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT) {
bool FloatDomain = SrcVT.isFloatingPoint();
// Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
-if (!FloatDomain && SrcVT.is128BitVector() && Mask.size() == 2 &&
-    Mask[0] == 0 && Mask[1] < 0) {
+if (!FloatDomain && SrcVT.is128BitVector() &&
+    isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
Shuffle = X86ISD::VZEXT_MOVL;
ShuffleVT = MVT::v2i64;
return true;
@@ -24562,17 +24586,17 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (SrcVT.is128BitVector() && Subtarget.hasSSE3()) {
-if (Mask.equals({0, 0})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v2f64;
return true;
}
-if (Mask.equals({0, 0, 2, 2})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v4f32;
return true;
}
-if (Mask.equals({1, 1, 3, 3})) {
+if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v4f32;
return true;
@@ -24581,17 +24605,17 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
if (SrcVT.is256BitVector()) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
-if (Mask.equals({0, 0, 2, 2})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v4f64;
return true;
}
-if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v8f32;
return true;
}
-if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7})) {
+if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v8f32;
return true;
@@ -24601,17 +24625,19 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
if (SrcVT.is512BitVector()) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
-if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v8f64;
return true;
}
-if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+if (isTargetShuffleEquivalent(
+        Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v16f32;
return true;
}
-if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+if (isTargetShuffleEquivalent(
+        Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v16f32;
return true;
@@ -24624,41 +24650,41 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
unsigned &Shuffle, MVT &ShuffleVT) {
bool FloatDomain = SrcVT.isFloatingPoint();
if (SrcVT.is128BitVector()) {
-if (Mask.equals({0, 0}) && FloatDomain) {
+if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
-if (Mask.equals({1, 1}) && FloatDomain) {
+if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
-if (Mask.equals({0, 0, 1, 1}) && FloatDomain) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
Shuffle = X86ISD::UNPCKL;
ShuffleVT = MVT::v4f32;
return true;
}
-if (Mask.equals({2, 2, 3, 3}) && FloatDomain) {
+if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
Shuffle = X86ISD::UNPCKH;
ShuffleVT = MVT::v4f32;
return true;
}
-if (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
-    Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
+if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
+    isTargetShuffleEquivalent(
+        Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
Shuffle = X86ISD::UNPCKL;
ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
return true;
}
-if (Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
-    Mask.equals(
-        {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15})) {
+if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
+    isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+                                     13, 14, 14, 15, 15})) {
Shuffle = X86ISD::UNPCKH;
ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
return true;


@@ -388,7 +388,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 1


@@ -51,7 +51,7 @@ define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
; ALL: # BB#0:
; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: retq
-%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 1, i32 1, i32 3, i32 3>)
+%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
ret <4 x float> %1
}
@@ -60,7 +60,7 @@ define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
; ALL: # BB#0:
; ALL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; ALL-NEXT: retq
-%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>)
+%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
ret <4 x float> %1
}
@@ -86,7 +86,7 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_identity:
; ALL: # BB#0:
; ALL-NEXT: retq
-%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
+%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
ret <8 x float> %2
}
@@ -114,7 +114,7 @@ define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; ALL: # BB#0:
; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: retq
-%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>)
+%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
ret <8 x float> %1
}


@@ -37,7 +37,7 @@ define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x doub
; CHECK: # BB#0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: retq
-%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
ret <8 x double> %res0
}
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
@@ -63,8 +63,8 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK: # BB#0:
; CHECK-NEXT: retq
-%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
-%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
ret <8 x i64> %res1
}
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
@@ -198,7 +198,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
-%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
@@ -208,7 +208,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
-%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
@@ -216,8 +216,8 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32>
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK: # BB#0:
; CHECK-NEXT: retq
-%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
-%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
ret <16 x i32> %res1
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
@@ -261,7 +261,7 @@ define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
; CHECK: # BB#0:
; CHECK-NEXT: retq
%select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
-%mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+%mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
%res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
ret <64 x i8> %res1


@@ -140,12 +140,12 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
ret <16 x i8> %1
@@ -154,12 +154,12 @@ define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
ret <16 x i8> %1


@@ -119,7 +119,7 @@ define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1)
; CHECK: # BB#0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-NEXT: retq
-%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
ret <16 x i8> %res0
}