[x86] Fix two independent miscompiles in the process of getting the same
test case to actually generate correct code.

The primary miscompile fixed here is that we weren't correctly handling
in-place elements in one half of a single-input v8i16 shuffle when
moving a dword of elements from that half to the other half. Some times,
we would clobber the in-place elements in forming the dword to move
across halves.
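
To make the failure mode concrete, the whole lowering can be modeled as a
composition of word- and dword-level permutes, and a candidate instruction
sequence is correct only if that composition reproduces the requested mask.
Below is a minimal standalone sketch of such a model (not the LLVM code;
the helper names are invented for illustration), checking the updated
five-instruction SSE2 sequence for the shuffle_v8i16_26405173 test against
its <2,6,4,0,5,1,7,3> mask:

#include <array>
#include <cassert>

// Model an xmm register as 8 word lanes; V[i] is the index of the original
// element currently sitting in lane i.
using V8 = std::array<int, 8>;

// pshuflw: permute the low 4 words, leave the high 4 untouched.
static V8 pshuflw(V8 In, std::array<int, 4> M) {
  V8 Out = In;
  for (int i = 0; i < 4; ++i)
    Out[i] = In[M[i]];
  return Out;
}

// pshufhw: permute the high 4 words, leave the low 4 untouched.
static V8 pshufhw(V8 In, std::array<int, 4> M) {
  V8 Out = In;
  for (int i = 0; i < 4; ++i)
    Out[4 + i] = In[4 + M[i]];
  return Out;
}

// pshufd: permute whole dwords, i.e. adjacent word pairs move together.
static V8 pshufd(V8 In, std::array<int, 4> M) {
  V8 Out;
  for (int i = 0; i < 4; ++i) {
    Out[2 * i] = In[2 * M[i]];
    Out[2 * i + 1] = In[2 * M[i] + 1];
  }
  return Out;
}

int main() {
  V8 V = {0, 1, 2, 3, 4, 5, 6, 7};
  // The updated SSE2 sequence for shuffle_v8i16_26405173; the high-half
  // masks are written relative to the half.
  V = pshuflw(V, {0, 2, 1, 3});
  V = pshufhw(V, {3, 1, 2, 0}); // asm comment: [0,1,2,3,7,5,6,4]
  V = pshufd(V, {0, 3, 2, 1});
  V = pshuflw(V, {1, 2, 3, 0});
  V = pshufhw(V, {1, 2, 0, 3}); // asm comment: [0,1,2,3,5,6,4,7]
  const V8 Want = {2, 6, 4, 0, 5, 1, 7, 3};
  // The clobbering bug manifests as exactly this kind of mismatch: a lane
  // that should keep its in-place element ends up holding something else.
  assert(V == Want);
  (void)Want;
  return 0;
}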

The fix for this involves forcibly marking the in-place inputs even when
there is no need to gather them into a dword, and much more carefully
re-arranging the elements when grouping them into a dword to move across
halves. With these two changes we generate correct shuffles for the test
case, but doing so exposed another miscompile. There are also some random
perturbations of the generated shuffle patterns in SSE2; it looks like
a wash, with more instructions in some cases and fewer in others.
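
The key piece of that re-arrangement is pairing two cross-half inputs into
a single dword without disturbing words that must stay in place; the code
below uses (Index XOR 1) to find the other word of a dword. Here is a
minimal standalone sketch of just the "free adjacent slot" cases, under
the simplifying assumptions noted in the comments -- an illustration of
the idea, not the shipped helper:

#include <array>
#include <cassert>

// Mask for one 4-word half: -1 means the slot is free, i means word i must
// stay in place, anything else means the slot is already repurposed.
using HalfMask = std::array<int, 4>;

// Try to pair two cross-half inputs A and B (word indices within their
// source half) into one dword, preferring a free slot adjacent to one of
// them.  (Index ^ 1) is the other word of the same dword.  Returns false
// when the heavier fallback cases in the real lowering would be needed.
static bool pairIntoDWord(HalfMask &M, int &A, int &B) {
  auto Clobbered = [&](int W) { return M[W] != -1 && M[W] != W; };
  if (!Clobbered(A) && M[A ^ 1] == -1) { // Free slot next to A.
    M[A] = A;
    M[A ^ 1] = B;
    B = A ^ 1;
    return true;
  }
  if (!Clobbered(B) && M[B ^ 1] == -1) { // Free slot next to B.
    M[B] = B;
    M[B ^ 1] = A;
    A = B ^ 1;
    return true;
  }
  return false;
}

int main() {
  // Word 0 is an in-place input that must not move; words 1 and 3 need to
  // travel to the other half as one dword.
  HalfMask M = {0, -1, -1, -1};
  int A = 1, B = 3;
  bool OK = pairIntoDWord(M, A, B);
  // Word 1 is parked next to word 3 (slot 2), forming dword {2,3}, and the
  // in-place word 0 is left untouched -- the property the old code broke.
  assert(OK && A == 2 && B == 3 && M[0] == 0 && M[2] == 1 && M[3] == 3);
  return 0;
}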

The second miscompile would corrupt the results into nonsense. It stems
from a buggy pattern in one of the added DAG combines. Mapping elements
through a PSHUFD when pairing redundant half-shuffles is *much* harder
than this code makes it out to be -- it requires reasoning about *all* of
the places the input is used in the PSHUFD, not just one of them. Plus,
we can't combine a half shuffle *into* a PSHUFD, but the code didn't
guard against that. I think this was just a bad idea, and I've simply
removed that aspect of the combine. No tests regress as a consequence, so
this seems OK.
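
To see one part of the trap, note that a PSHUFD can feed the same source
dword to several destination dwords, so a later half shuffle only ever
touches one of several uses of those words; remapping the PSHUFD based on
that single use silently changes the others, and the half shuffle's
word-granularity movement may not be expressible as a dword move at all.
A small standalone sketch of the shared-use case (not the removed combine
itself):

#include <array>

// V[i] is the index of the original element sitting in word lane i.
using V8 = std::array<int, 8>;

// pshufd moves whole dwords (adjacent word pairs).
static V8 pshufd(V8 In, std::array<int, 4> M) {
  V8 Out;
  for (int i = 0; i < 4; ++i) {
    Out[2 * i] = In[2 * M[i]];
    Out[2 * i + 1] = In[2 * M[i] + 1];
  }
  return Out;
}

// pshuflw permutes only the low 4 words.
static V8 pshuflw(V8 In, std::array<int, 4> M) {
  V8 Out = In;
  for (int i = 0; i < 4; ++i)
    Out[i] = In[M[i]];
  return Out;
}

int main() {
  V8 Id = {0, 1, 2, 3, 4, 5, 6, 7};
  // This PSHUFD reuses source dword 0 in destination dwords 0 *and* 2, so
  // words 0 and 1 are live in both the low and the high half afterwards.
  V8 A = pshufd(Id, {0, 1, 0, 3});   // {0,1,2,3, 0,1, 6,7}
  V8 B = pshuflw(A, {1, 0, 2, 3});   // {1,0,2,3, 0,1, 6,7}
  // Folding the PSHUFLW into the PSHUFD by looking only at the low-half
  // use of dword 0 is unsound: the swapped word order {1,0} is not a dword
  // move, and rewriting the shared dword would also alter lanes 4-5, which
  // the PSHUFLW never touched.
  return (B[0] == 1 && B[1] == 0 && B[4] == 0 && B[5] == 1) ? 0 : 1;
}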

llvm-svn: 214954
Chandler Carruth 2014-08-06 10:16:36 +00:00
parent 8f23ba26d2
commit c3927cd8c9
2 changed files with 108 additions and 71 deletions


@@ -7386,9 +7386,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs = [&PSHUFDMask](
ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
@@ -7397,6 +7398,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
@@ -7408,10 +7417,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
if (!HToLInputs.empty())
fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
if (!LToHInputs.empty())
fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
@@ -7420,7 +7427,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
int SourceOffset, int DestOffset) {
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
};
@@ -7498,18 +7506,68 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
"Not all dwords can be clobbered!");
SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1))
M = InputsFixed[1];
else if (M == InputsFixed[1])
M = InputsFixed[0] ^ 1;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = SourceDWordBase + SourceOffset;
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = SourceDWordBase + 1 + SourceOffset;
IncomingInputs[0] = SourceDWordBase + SourceOffset;
IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
@@ -7524,9 +7582,9 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
@@ -19391,26 +19449,6 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
// Other-half shuffles are no-ops.
continue;
case X86ISD::PSHUFD: {
// We can only handle pshufd if the half we are combining either stays in
// its half, or switches to the other half. Bail if one of these isn't
// true.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
(VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
return false;
// Map the mask through the pshufd and keep walking up the chain.
for (int i = 0; i < 4; ++i)
Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
// Switch halves if the pshufd does.
CombineOpcode =
VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
continue;
}
}
// Break out of the loop if we break out of the switch.
break;


@@ -173,9 +173,9 @@ define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_26405173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
@@ -190,9 +190,9 @@ define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_20645173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
@@ -207,9 +207,9 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_26401375
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_26401375
@@ -404,9 +404,8 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_45630127
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_45630127
@@ -652,8 +651,9 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_XXX1X579
; SSE2: # BB#0:
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
@@ -663,36 +663,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
;
; SSSE3-LABEL: @shuffle_v8i16_XXX1X579
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+}},2,3,10,11,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; FIXME-SSE2-LABEL: @shuffle_v8i16_XX4X8acX
; FIXME-SSE2: # BB#0:
; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
; FIXME-SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,0,2,4,5,6,7]
; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7]
; FIXME-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,7]
; FIXME-SSE2-NEXT: retq
; SSE2-LABEL: @shuffle_v8i16_XX4X8acX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,4,7]
; SSE2-NEXT: retq
;
; FIXME-SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
; FIXME-SSSE3: # BB#0:
; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; FIXME-SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,5,7,6,7]
; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
; FIXME-SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FIXME-SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,{{[0-9]+,[0-9]+}},2,3,6,7,10,11{{[0-9]+,[0-9]+}}]
; FIXME-SSSE3-NEXT: retq
; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # [[X:xmm[0-9]+]] = xmm0[2,1,2,3]
; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},2,3,{{[0-9]+,[0-9]+}},0,1,4,5,8,9,{{[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}