[x86] Teach the new vector shuffle lowering to use 'punpcklwd' and

'punpckhwd' instructions when suitable rather than falling back to the generic algorithm. While we could canonicalize to these patterns late in the process, that wouldn't help when the freedom to use them is only visible during initial lowering, when undef lanes are well understood. This, it turns out, is very important for matching the shuffle patterns that are used to lower sign extension. Fixes a small but relevant regression in gcc-loops with the new lowering. When I changed this I noticed that several 'pshufd' lowerings became unpck variants. This is bad because it removes the ability to freely copy in the same instruction. I've adjusted the widening test (canWidenShuffleElements) to handle undef lanes correctly, and now those will correctly continue to use 'pshufd' to lower. However, this caused a bunch of churn in the test cases. No functional change there, just churn. Both of these changes are part of addressing a general weakness in the new lowering -- it doesn't sufficiently leverage undef lanes. I have at least a couple of patches that will help there, at least in an academic sense. llvm-svn: 217752
2014-09-15 09:02:37 +00:00 · 2014-09-15 09:02:37 +00:00 · 44e64b5267
parent 65379c564d
commit 44e64b5267
2 changed files with 51 additions and 26 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -7667,6 +7667,12 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
+  if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
+
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
@ -8914,7 +8920,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 /// simplified by widening the elements being shuffled.
 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-    if (Mask[i] % 2 != 0 || Mask[i] + 1 != Mask[i+1])
+    if ((Mask[i] != -1 && Mask[i] % 2 != 0) ||
+        (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 ||
+                               (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1]))))
      return false;

  return true;
@ -8971,7 +8979,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
      canWidenShuffleElements(Mask)) {
    SmallVector<int, 8> NewMask;
    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-      NewMask.push_back(Mask[i] / 2);
+      NewMask.push_back(Mask[i] != -1
+                            ? Mask[i] / 2
+                            : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1));
    MVT NewVT =
        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
                         VT.getVectorNumElements() / 2);
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@ -53,6 +53,22 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i16> %shuffle
 }
+define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
+; ALL-LABEL: @shuffle_v8i16_u0u1u2u3
+; ALL:       # BB#0:
+; ALL-NEXT:    unpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
+; ALL-LABEL: @shuffle_v8i16_u4u5u6u7
+; ALL:       # BB#0:
+; ALL-NEXT:    unpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
+  ret <8 x i16> %shuffle
+}
 define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_31206745
 ; ALL:       # BB#0:
@ -482,7 +498,7 @@ define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_0c1d2e3f
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
+; ALL-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; ALL-NEXT:    punpcklwd %xmm1, %xmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
@ -492,8 +508,8 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_4c5d6e7f
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
 ; ALL-NEXT:    punpcklwd %xmm1, %xmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@ -503,7 +519,7 @@ define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_48596a7b
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
 ; ALL-NEXT:    punpcklwd %xmm1, %xmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
@ -558,8 +574,8 @@ define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_c4d5e6f7
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
-; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; ALL-NEXT:    pshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
 ; ALL-NEXT:    punpcklwd %xmm2, %xmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
@ -603,7 +619,7 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_032dXXXX
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    punpcklwd %xmm1, %xmm0
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
@ -613,18 +629,17 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_032dXXXX
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; SSSE3-NEXT:    punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0,1,12,13,8,9,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
 ; SSSE3-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle
 }
-define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
-; ALL-LABEL: @shuffle_v8i16_XXXcXXXX
+define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
+; ALL-LABEL: @shuffle_v8i16_XXXdXXXX
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
-; ALL-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[0,2,2,3]
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle
@ -633,7 +648,7 @@ define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_012dXXXX
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    punpcklwd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
@ -643,7 +658,7 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_012dXXXX
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; SSSE3-NEXT:    punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,8,9,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
 ; SSSE3-NEXT:    retq
@ -654,7 +669,7 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_XXXXcde3
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; SSE2-NEXT:    punpckhwd %xmm0, %xmm1
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
@ -663,7 +678,7 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_XXXXcde3
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    punpckhwd %xmm0, %xmm1 # xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    pshufb {{.*}} # xmm1 = xmm1[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,4,5,8,9,14,15]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
@ -675,7 +690,7 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_cde3XXXX
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; SSE2-NEXT:    punpckhwd %xmm0, %xmm1
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
@ -684,7 +699,7 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_cde3XXXX
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    punpckhwd %xmm0, %xmm1 # xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    pshufb {{.*}} # xmm1 = xmm1[0,1,4,5,8,9,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
@ -696,8 +711,8 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_012dcde3
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; SSE2-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
+; SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    punpckhwd %xmm2, %xmm1
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
@ -712,8 +727,8 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_012dcde3
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; SSSE3-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,3,0,1]
 ; SSSE3-NEXT:    punpckhwd %xmm2, %xmm1 # xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT:    pshufb {{.*}} # xmm1 = xmm1[0,1,4,5,8,9,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
 ; SSSE3-NEXT:    punpcklwd %xmm3, %xmm0 # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@ -750,7 +765,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_XX4X8acX
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*}}   # xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*}}    # xmm1 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@ -762,7 +777,7 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*}}    # [[X:xmm[0-9]+]] = xmm0[2,1,2,3]
+; SSSE3-NEXT:    pshufd {{.*}}    # [[X:xmm[0-9]+]] = xmm0[2,3,0,1]
 ; SSSE3-NEXT:    pshuflw {{.*}}   # xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSSE3-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
 ; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]