diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2fcd0dc7fc4f..abbc3d167fc0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9637,6 +9637,58 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, DAG.getUNDEF(MVT::v8i16), Mask); } +/// \brief Helper to form a PSHUFB-based shuffle+blend. +static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V1Mask[16]; + SDValue V2Mask[16]; + V1InUse = false; + V2InUse = false; + + int Size = Mask.size(); + int Scale = 16 / Size; + for (int i = 0; i < 16; ++i) { + if (Mask[i / Scale] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + const int ZeroMask = 0x80; + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale + : ZeroMask; + int V2Idx = Mask[i / Scale] < Size + ? ZeroMask + : (Mask[i / Scale] - Size) * Scale + i % Scale; + if (Zeroable[i / Scale]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + SDValue V; + if (V1InUse && V2InUse) + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + else + V = V1InUse ? V1 : V2; + + // Cast the result back to the correct type. + return DAG.getNode(ISD::BITCAST, DL, VT, V); +} + /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with @@ -9746,32 +9798,26 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); } - // If we have direct support for blends, we should lower by decomposing into - // a permute. - if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, DAG); + // Try to lower by permuting the inputs into an unpack instruction unless we + // have direct support for blending. + if (!IsBlendSupported) { + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG)) + return Unpack; - // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG)) - return Unpack; - - int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - - for (int i = 0; i < 4; ++i) { - LoBlendMask[i] = Mask[i]; - HiBlendMask[i] = Mask[i + 4]; + // If we can use PSHUFB, that will be better as it can both shuffle and set + // up an efficient blend. + if (Subtarget->hasSSSE3()) { + bool V1InUse, V2InUse; + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); + } } - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); - LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); - HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); - - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); + // We can always bit-blend if we have to so the fallback strategy is to + // decompose into single-input permutes and blends. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even @@ -10002,27 +10048,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget->hasSSSE3()) { - SDValue V1Mask[16]; - SDValue V2Mask[16]; bool V1InUse = false; bool V2InUse = false; - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - for (int i = 0; i < 16; ++i) { - if (Mask[i] == -1) { - V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); - } else { - const int ZeroMask = 0x80; - int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask); - int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16); - if (Zeroable[i]) - V1Idx = V2Idx = ZeroMask; - V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); - V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); - V1InUse |= (ZeroMask != V1Idx); - V2InUse |= (ZeroMask != V2Idx); - } - } + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, + DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is @@ -10046,22 +10076,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Unpack; } - if (V1InUse) - V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); - if (V2InUse) - V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); - - // If we need shuffled inputs from both, blend the two. - if (V1InUse && V2InUse) - return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); - if (V1InUse) - return V1; // Single inputs are easy. - if (V2InUse) - return V2; // Single inputs are easy. - // Shuffling to a zeroable vector. - return getZeroVector(MVT::v16i8, Subtarget, DAG, DL); + return PSHUFB; } // There are special ways we can lower some single-element blends. diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index dba18e4480cd..a0450dc8fca2 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -41,11 +41,11 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64-LABEL: t2: ; X64: ## BB#0: -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7] +; X64-NEXT: pandn %xmm1, %xmm2 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 7324b7d7298a..6b643f9c411e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1100,27 +1100,27 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE2: # BB#0: # %entry ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,1,2,3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,1,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,1,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: packuswb %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm5, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 730a152c18de..1822610d660f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1267,29 +1267,21 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_012dcde3: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_012dcde3: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,8,9,10,11,12,13],zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7] +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v8i16_012dcde3: