From a3dc1ba142c5c1f8e4b53e501597904c23e5c61f Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Sat, 28 May 2016 14:38:04 +0000
Subject: [PATCH] [X86] Try to zero elts when lowering 256-bit shuffle with
 PSHUFB.

Otherwise we fall back to a blend of PSHUFBs later on.

Differential Revision: http://reviews.llvm.org/D19661

llvm-svn: 271113
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 101 ++++++++++++------
 .../CodeGen/X86/vector-shuffle-256-v16.ll     |  18 ++++
 .../CodeGen/X86/vector-shuffle-256-v32.ll     |  16 +++
 3 files changed, 100 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d92d7b33921..f3b4c70b7122 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7170,6 +7170,59 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
+/// Mutate a shuffle mask, replacing zeroable elements with SM_SentinelZero.
+static void computeZeroableShuffleMask(MutableArrayRef<int> Mask,
+                                       SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+      Mask[i] = SM_SentinelZero;
+  }
+}
+
+/// Try to lower a shuffle with a single PSHUFB of V1.
+/// This is only possible if V2 is unused (at all, or only for zero elements).
+static SDValue lowerVectorShuffleWithPSHUFB(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2,
+                                            const X86Subtarget &Subtarget,
+                                            SelectionDAG &DAG) {
+  const int NumBytes = VT.is128BitVector() ? 16 : 32;
+  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+         (Subtarget.hasAVX2() && VT.is256BitVector()));
+
+  SmallVector<int, 32> ZeroableMask(Mask.begin(), Mask.end());
+  computeZeroableShuffleMask(ZeroableMask, V1, V2);
+
+  if (!isSingleInputShuffleMask(ZeroableMask) ||
+      is128BitLaneCrossingShuffleMask(VT, Mask))
+    return SDValue();
+
+  SmallVector<SDValue, 32> PSHUFBMask(NumBytes);
+  // Sign bit set in i8 mask means zero element.
+  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+  for (int i = 0; i < NumBytes; ++i) {
+    int M = ZeroableMask[i / NumEltBytes];
+    if (M == SM_SentinelUndef) {
+      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+    } else if (M == SM_SentinelZero) {
+      PSHUFBMask[i] = ZeroMask;
+    } else {
+      M = M * NumEltBytes + (i % NumEltBytes);
+      M = i < 16 ? M : M - 16;
+      PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
+    }
+  }
+
+  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+  return DAG.getBitcast(
+      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
+                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
@@ -11389,26 +11442,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return lowerV8I16GeneralSingleInputVectorShuffle(
           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
     }
-
-    SDValue PSHUFBMask[32];
-    for (int i = 0; i < 16; ++i) {
-      if (Mask[i] == -1) {
-        PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
-        continue;
-      }
-
-      int M = i < 8 ? Mask[i] : Mask[i] - 8;
-      assert(M >= 0 && M < 8 && "Invalid single-input mask!");
-      PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
-      PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
-    }
-    return DAG.getBitcast(
-        MVT::v16i16,
-        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
-                    DAG.getBitcast(MVT::v32i8, V1),
-                    DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask)));
   }
 
+  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
+                                                    V2, Subtarget, DAG))
+    return PSHUFB;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11471,24 +11510,16 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return V;
 
-  if (isSingleInputShuffleMask(Mask)) {
-    // There are no generalized cross-lane shuffle operations available on i8
-    // element types.
-    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
-      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
-                                                     Mask, DAG);
+  // There are no generalized cross-lane shuffle operations available on i8
+  // element types.
+  if (isSingleInputShuffleMask(Mask) &&
+      is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
+                                                   DAG);
 
-    SDValue PSHUFBMask[32];
-    for (int i = 0; i < 32; ++i)
-      PSHUFBMask[i] =
-          Mask[i] < 0
-              ? DAG.getUNDEF(MVT::i8)
-              : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
-                                MVT::i8);
-
-    return DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, V1,
-                       DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask));
-  }
+  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
+                                                    V2, Subtarget, DAG))
+    return PSHUFB;
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
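For reference, the per-byte VPSHUFB control built by the new lowerVectorShuffleWithPSHUFB helper can be illustrated with a small standalone C++ sketch that is independent of the LLVM APIs. The sentinel constants and the buildPSHUFBMask function below are names invented for this illustration (they mirror SM_SentinelUndef and SM_SentinelZero): zeroable elements become 0x80 control bytes (sign bit set zeroes the destination byte), undef elements stay undef, and in-range elements are rescaled to lane-local byte indices.

// Standalone sketch (not LLVM code) of the byte-mask expansion performed by
// lowerVectorShuffleWithPSHUFB above.
#include <cstdio>
#include <vector>

namespace {
const int SentinelUndef = -1; // element is "don't care"
const int SentinelZero = -2;  // element must be forced to zero

// Expand an element-level 256-bit shuffle mask (elements EltBytes wide) into
// the 32 VPSHUFB control bytes. 0x80 zeroes a byte; any other value selects a
// byte within the same 128-bit lane.
std::vector<int> buildPSHUFBMask(const std::vector<int> &Mask, int EltBytes) {
  std::vector<int> Bytes(32);
  for (int i = 0; i < 32; ++i) {
    int M = Mask[i / EltBytes];
    if (M == SentinelUndef) {
      Bytes[i] = -1;                     // undef byte, printed as 'u'
    } else if (M == SentinelZero) {
      Bytes[i] = 0x80;                   // sign bit set -> zero this byte
    } else {
      M = M * EltBytes + (i % EltBytes); // byte index into the source vector
      Bytes[i] = i < 16 ? M : M - 16;    // VPSHUFB indexes within each lane
    }
  }
  return Bytes;
}
} // namespace

int main() {
  // Element mask of the new v16i16 test: <1, zero, 2, zero, 4, undef, 6..15>.
  std::vector<int> Mask = {1, SentinelZero, 2, SentinelZero, 4, SentinelUndef,
                           6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  for (int B : buildPSHUFBMask(Mask, /*EltBytes=*/2)) {
    if (B < 0)
      std::printf("u,");
    else
      std::printf("%d,", B);
  }
  // Prints: 2,3,128,128,4,5,128,128,8,9,u,u,12,13,14,15,0,1,... which
  // corresponds to the ymm0[2,3],zero,zero,... pattern checked in the
  // vector-shuffle-256-v16.ll test below.
  std::printf("\n");
  return 0;
}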
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index cc12397a9a7c..23fbba61dabd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2378,6 +2378,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
 ; AVX1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 846e606a3616..80cbaab164c1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -953,6 +953,22 @@ define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX1: # BB#0:
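The tests above also exercise the second new helper, computeZeroableShuffleMask, which rewrites mask elements known to read zero into a force-to-zero sentinel so the shuffle becomes single-input and the PSHUFB path can handle it. Below is a minimal standalone C++ sketch of that idea, covering only the case used here (the second shufflevector operand is a zero vector); the names SentinelZero, SentinelUndef and zeroOutSecondOperand are invented for the sketch, and the real helper detects zeroable elements more generally via computeZeroableShuffleElements.

// Standalone sketch (not LLVM code): rewrite mask elements that read a known
// all-zeros operand into a "force to zero" sentinel, as
// computeZeroableShuffleMask does for the shuffles in the tests above.
#include <cstdio>
#include <vector>

namespace {
const int SentinelUndef = -1;
const int SentinelZero = -2;

// Mask elements in [0, Size) read operand 1, [Size, 2*Size) read operand 2.
// If operand 2 is known to be all zeros, rewrite its elements to SentinelZero,
// leaving a mask that only references operand 1.
void zeroOutSecondOperand(std::vector<int> &Mask, bool V2IsZero) {
  const int Size = static_cast<int>(Mask.size());
  for (int &M : Mask)
    if (V2IsZero && M >= Size)
      M = SentinelZero;
}
} // namespace

int main() {
  // The v16i16 test shuffle: operand 2 is zeroinitializer, so indices 16+
  // become SentinelZero and the mask becomes single-input afterwards.
  std::vector<int> Mask = {1, 16, 2, 16, 4, SentinelUndef,
                           6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  zeroOutSecondOperand(Mask, /*V2IsZero=*/true);
  for (int M : Mask)
    std::printf("%d ", M); // 1 -2 2 -2 4 -1 6 7 8 9 10 11 12 13 14 15
  std::printf("\n");
  return 0;
}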