[x86] Rewrite the byte shift detection to not use boolean variables to

track state. I didn't like this in the code review because the pattern tends to be error prone, but I didn't see a clear way to rewrite it. Turns out that there were bugs here, I found them when fuzz testing our shuffle lowering for correctness on x86. The core of the problem is that we need to consistently test all our preconditions for the same directionality of shift and the same input vector. Instead, formulate this as two predicates (one doesn't depend on the input in any way), pass things like the directionality and input vector as inputs, and loop over the alternatives. This fixes a pattern of very rare miscompiles coming out of this code. Turned up roughly 4 out of every 1 million v8 shuffles in my fuzz testing. The new code is over half a million test runs with no failures yet. I've also fuzzed every other function in the lowering code with over 3.5 million test cases and not discovered any other miscompiles. llvm-svn: 229642
2015-02-18 07:13:48 +00:00 · 2015-02-18 07:13:48 +00:00 · 55553f5299
parent 1348f17205
commit 55553f5299
2 changed files with 54 additions and 23 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -7888,40 +7888,39 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
  // [  5, 6,  7, zz, zz, zz, zz, zz]
  // [ -1, 5,  6,  7, zz, zz, zz, zz]
  // [  1, 2, -1, -1, -1, -1, zz, zz]
-  auto MatchByteShift = [&](int Shift) -> SDValue {
-    bool MatchLeft = true, MatchRight = true;
-    for (int l = 0; l < NumElts; l += NumLaneElts) {
-      for (int i = 0; i < Shift; ++i)
-        MatchLeft &= Zeroable[l + i];
-      for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
-        MatchRight &= Zeroable[l + i];
-    }
-    if (!(MatchLeft || MatchRight))
-      return SDValue();

-    bool MatchV1 = true, MatchV2 = true;
+  auto CheckZeros = [&](int Shift, bool LeftShift) {
+    for (int l = 0; l < NumElts; l += NumLaneElts)
+      for (int i = 0; i < Shift; ++i)
+        if (!Zeroable[l + i + (LeftShift ? 0 : (NumLaneElts - Shift))])
+          return false;
+
+    return true;
+  };
+
+  auto MatchByteShift = [&](int Shift, bool LeftShift, SDValue V) {
    for (int l = 0; l < NumElts; l += NumLaneElts) {
-      unsigned Pos = MatchLeft ? Shift + l : l;
-      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Pos = LeftShift ? Shift + l : l;
+      unsigned Low = LeftShift ? l : Shift + l;
      unsigned Len = NumLaneElts - Shift;
-      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
-      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
+      if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+                                      Low + (V == V1 ? 0 : NumElts)))
+        return SDValue();
    }
-    if (!(MatchV1 || MatchV2))
-      return SDValue();

    int ByteShift = Shift * Scale;
-    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
-    SDValue V = MatchV1 ? V1 : V2;
+    unsigned Op = LeftShift ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
-    V = DAG.getNode(Op, DL, ShiftVT, V,
-                    DAG.getConstant(ByteShift, MVT::i8));
+    V = DAG.getNode(Op, DL, ShiftVT, V, DAG.getConstant(ByteShift, MVT::i8));
    return DAG.getNode(ISD::BITCAST, DL, VT, V);
  };

  for (int Shift = 1; Shift < NumLaneElts; ++Shift)
-    if (SDValue S = MatchByteShift(Shift))
-      return S;
+    for (bool LeftShift : {true, false})
+      if (CheckZeros(Shift, LeftShift))
+        for (SDValue V : {V1, V2})
+          if (SDValue S = MatchByteShift(Shift, LeftShift, V))
+            return S;

  // no match
  return SDValue();
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@ -2146,3 +2146,35 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) {
  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
  ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_fu3ucc5u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_fu3ucc5u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_fu3ucc5u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 15, i32 undef, i32 3, i32 undef, i32 12, i32 12, i32 5, i32 undef>
+  ret <8 x i16> %shuffle
+}