[x86] Add a dedicated lowering path for zext-compatible vector shuffles

to the new vector shuffle lowering code. This allows us to emit PMOVZX variants consistently for patterns where it is a viable lowering. This instruction is both fast and allows us to fold loads into it. This only hooks the new lowering up for i16 and i8 element widths, mostly so I could manage the change to the tests. I'll add the i32 one next, although it is significantly less interesting. One thing to note is that we already had some tests for these patterns but those tests had far less horrible instructions. The problem is that those tests weren't checking the strict start and end of the instruction sequence. =[ As a consequence something changed in the lowering making us generate *TERRIBLE* code for these patterns in SSE2 through SSSE3. I've consolidated all of the tests and spelled out the madness that we currently emit for these shuffles. I'm going to try to figure out what has gone wrong here. llvm-svn: 218102
2014-09-19 06:07:49 +00:00 · 2014-09-19 06:07:49 +00:00 · 398ba9a018
parent ffbc690933
commit 398ba9a018
3 changed files with 321 additions and 35 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -19,6 +19,7 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@ -7353,6 +7354,125 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                                 DAG.getConstant(Rotation * Scale, MVT::i8)));
 }

+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                                     SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable(Mask.size(), false);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy cases.
+    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // If this is an index into a build_vector node, dig out the input value and
+    // use it.
+    SDValue V = M < Size ? V1 : V2;
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    SDValue Input = V.getOperand(M % Size);
+    // The UNDEF opcode check really should be dead code here, but not quite
+    // worth asserting on (it isn't invalid, just unexpected).
+    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+      Zeroable[i] = true;
+  }
+
+  return Zeroable;
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension.
+///
+/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
+/// shuffle throuh a zero extension. It doesn't check for the availability or
+/// profitability of this lowering though, it tries to aggressively match this
+/// pattern. It handles both blends with all-zero inputs to explicitly
+/// zero-extend and undef-lanes (sometimes undef due to masking out later).
+static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
+                                              SDValue V2, ArrayRef<int> Mask,
+                                              SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Bits = VT.getSizeInBits();
+  int EltBits = VT.getScalarSizeInBits();
+  int NumElements = Mask.size();
+
+  // Define a helper function to check a particular zext-stride and lower to it
+  // if valid.
+  auto LowerWithStride = [&](int Stride) -> SDValue {
+    SDValue InputV;
+    for (int i = 0; i < NumElements; ++i) {
+      if (Mask[i] == -1)
+        continue; // Valid anywhere but doesn't tell us anything.
+      if (i % Stride != 0) {
+        // Each of the extend elements needs to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+        else
+          continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = Mask[i] < NumElements ? V1 : V2;
+      if (!InputV)
+        InputV = V;
+      else if (InputV != V)
+        return SDValue(); // Flip-flopping inputs.
+
+      if (Mask[i] % NumElements != i / Stride)
+        return SDValue(); // Non-consecutive strided elemenst.
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should always
+    // have already been handled.
+    // FIXME: Maybe handle this here in case during blending we end up with one?
+    if (!InputV)
+      return SDValue();
+
+    // Found a valid lowering! Compute all the types and the operation. We force
+    // everything to integer types here as that's the only way zext makes sense.
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+                                 NumElements / Stride);
+
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+  };
+
+  // The widest stride possible for zero extending is to a 64-bit integer.
+  assert(Bits % 64 == 0 &&
+         "The number of bits in a vector must be divisible by 64 on x86!");
+  int NumExtElements = Bits / 64;
+
+  // Each iteration, try extending the elements half as much, but into twice as
+  // many elements.
+  for (; NumExtElements < NumElements; NumExtElements *= 2) {
+    assert(
+        NumElements % NumExtElements == 0 &&
+        "The input vector size must be divisble by the extended size.");
+    int Stride = NumElements / NumExtElements;
+    if (SDValue V = LowerWithStride(Stride))
+      return V;
+  }
+
+  // No viable zext lowering found.
+  return SDValue();
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@ -8390,6 +8510,14 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,

  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative.
+  if (Subtarget->hasSSE41())
+    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
+                                                      OrigMask, DAG))
+      return ZExt;
+
+
  auto isV1 = [](int M) { return M >= 0 && M < 8; };
  auto isV2 = [](int M) { return M >= 8; };

@ -8553,6 +8681,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                        OrigMask, DAG))
      return Rotate;

+  // Try to use a zext lowering.
+  if (Subtarget->hasSSE41())
+    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
+                                                      OrigMask, DAG))
+      return ZExt;
+
  int MaskStorage[16] = {
      OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
      OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@ -291,41 +291,6 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
  ret <16 x i8> %shuffle
 }

-define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; SSE2-LABEL: @zext_to_v8i16_shuffle
-; SSE2:         pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSSE3-LABEL: @zext_to_v8i16_shuffle
-; SSSE3:         pxor %xmm1, %xmm1
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSE41-LABEL: @zext_to_v8i16_shuffle
-; SSE41:         pxor %xmm1, %xmm1
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; SSE2-LABEL: @zext_to_v4i32_shuffle
-; SSE2:         pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSSE3-LABEL: @zext_to_v4i32_shuffle
-; SSSE3:         pxor %xmm1, %xmm1
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSE41-LABEL: @zext_to_v4i32_shuffle
-; SSE41:         pxor %xmm1, %xmm1
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
-  ret <16 x i8> %shuffle
-}
-
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
 ; SSE2-LABEL: @trunc_v4i32_shuffle
 ; SSE2:       # BB#0:
@ -545,3 +510,98 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
  ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSSE3-NEXT:    pshufb {{.*}}    # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm0, %[[X2]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
+; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+  ret <16 x i8> %shuffle
+}
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@ -981,3 +981,95 @@ define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
  ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*}}  # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*}}  # xmm0 = xmm0[0,1,0,3]
+; SSSE3-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSE2-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSE2-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = [[X2]][0,3,2,1]
+; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
+; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
+; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
+; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i16> %shuffle
+}