From 34a054ce7170060a6ec9ef4eb2dcacd69225715a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 15 Feb 2020 20:04:15 +0000
Subject: [PATCH] [X86] combineX86ShuffleChain - add support for combining to
 X86ISD::ROTLI

Refactors matchShuffleAsBitRotate to allow use by both
lowerShuffleAsBitRotate and matchUnaryPermuteShuffle.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 98 +++++++++++--------
 .../X86/vector-shuffle-combining-avx2.ll      | 15 ++-
 .../X86/vector-shuffle-combining-ssse3.ll     | 21 +++-
 .../X86/vector-shuffle-combining-xop.ll       |  2 +-
 4 files changed, 88 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3bc3c4a97825..c9b4b5967a59 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11704,24 +11704,12 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
   return RotateAmt;
 }
 
-/// Lower shuffle using X86ISD::VROTLI rotations.
-static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
-                                       ArrayRef<int> Mask,
-                                       const X86Subtarget &Subtarget,
-                                       SelectionDAG &DAG) {
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+                                   const X86Subtarget &Subtarget,
+                                   ArrayRef<int> Mask) {
   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
-  MVT SVT = VT.getScalarType();
-  int EltSizeInBits = SVT.getScalarSizeInBits();
   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
 
-  // Only XOP + AVX512 targets have bit rotation instructions.
-  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
-  bool IsLegal =
-      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
-  if (!IsLegal && Subtarget.hasSSE3())
-    return SDValue();
-
   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
   int MaxSubElts = 64 / EltSizeInBits;
@@ -11730,36 +11718,55 @@ static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
     if (RotateAmt < 0)
       continue;
 
-    int NumElts = VT.getVectorNumElements();
+    int NumElts = Mask.size();
     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
-    MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+    RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+    return RotateAmt * EltSizeInBits;
+  }
 
-    // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
-    // expanded to OR(SRL,SHL), will be more efficient, but if they can
-    // widen to vXi16 or more then existing lowering should will be better.
-    int RotateAmtInBits = RotateAmt * EltSizeInBits;
-    if (!IsLegal) {
-      if ((RotateAmtInBits % 16) == 0)
-        return SDValue();
-      // TODO: Use getTargetVShiftByConstNode.
-      unsigned ShlAmt = RotateAmtInBits;
-      unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
-      V1 = DAG.getBitcast(RotateVT, V1);
-      SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
-                                DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
-      SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
-                                DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
-      SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
-      return DAG.getBitcast(VT, Rot);
-    }
+  return -1;
+}
 
-    SDValue Rot =
-        DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
-                    DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
+/// Lower shuffle using X86ISD::VROTLI rotations.
+static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
+                                       ArrayRef<int> Mask,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  // Only XOP + AVX512 targets have bit rotation instructions.
+  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
+  bool IsLegal =
+      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
+  if (!IsLegal && Subtarget.hasSSE3())
+    return SDValue();
+
+  MVT RotateVT;
+  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+                                          Subtarget, Mask);
+  if (RotateAmt < 0)
+    return SDValue();
+
+  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+  // expanded to OR(SRL,SHL), will be more efficient, but if they can
+  // widen to vXi16 or more then existing lowering should be better.
+  if (!IsLegal) {
+    if ((RotateAmt % 16) == 0)
+      return SDValue();
+    // TODO: Use getTargetVShiftByConstNode.
+    unsigned ShlAmt = RotateAmt;
+    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+    V1 = DAG.getBitcast(RotateVT, V1);
+    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
     return DAG.getBitcast(VT, Rot);
   }
 
-  return SDValue();
+  SDValue Rot =
+      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+  return DAG.getBitcast(VT, Rot);
 }
 
 /// Try to lower a vector shuffle as a byte rotation.
@@ -33538,6 +33545,19 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to match against bit rotates.
+  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+      ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+       Subtarget.hasAVX512())) {
+    int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+                                            Subtarget, Mask);
+    if (0 < RotateAmt) {
+      Shuffle = X86ISD::VROTLI;
+      PermuteImm = (unsigned)RotateAmt;
+      return true;
+    }
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 7a8a7d326238..d922de739c9d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -464,10 +464,17 @@ define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
 }
 
 define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
-; CHECK-LABEL: combine_pshufb_not_as_pshufw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
-; CHECK-NEXT:    ret{{[l|q]}}
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
+; AVX2-NEXT:    ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vprold $16, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> )
   ret <32 x i8> %res1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 16a174c0b825..32709b4fd5dc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -403,10 +403,23 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
 ; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_pshufb_not_as_pshufw:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_pshufb_not_as_pshufw:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pshufb_not_as_pshufw:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: combine_pshufb_not_as_pshufw:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprold $16, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> )
   ret <16 x i8> %res1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 5264ab101c4a..9c507ad5443e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -255,7 +255,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
 define <16 x i8> @combine_vpperm_as_proti_v8i16(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_as_proti_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vprotw $8, %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> )
   ret <16 x i8> %res0
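
Illustration (not part of the patch above): the refactored matchShuffleAsBitRotate answers one question - is this per-element shuffle mask really a left bit-rotation of wider lanes built from NumSubElts consecutive elements, and if so by how many bits? That bit amount is what lowerShuffleAsBitRotate feeds to X86ISD::VROTLI and what matchUnaryPermuteShuffle now reports as PermuteImm. The standalone C++ sketch below mirrors that matching idea under simplified assumptions; the function name matchBitRotateAmountSketch and the plain std::vector interface are hypothetical and are not the LLVM API.

// Editorial sketch, not the LLVM implementation: return the left-rotate
// amount (in bits) implied by a shuffle mask over elements of EltSizeInBits
// bits, treating each run of NumSubElts elements as one wide lane, or -1 if
// the mask is not such a rotation.
#include <cstdio>
#include <vector>

static int matchBitRotateAmountSketch(const std::vector<int> &Mask,
                                      int NumSubElts, int EltSizeInBits) {
  const int NumElts = static_cast<int>(Mask.size());
  if (NumSubElts < 2 || NumElts % NumSubElts != 0)
    return -1;

  int RotateAmt = -1; // rotation in elements, must agree across all lanes
  for (int I = 0; I != NumElts; I += NumSubElts) {
    for (int J = 0; J != NumSubElts; ++J) {
      const int M = Mask[I + J];
      if (M < 0)
        continue; // undef element is compatible with any rotation
      if (M < I || M >= I + NumSubElts)
        return -1; // source comes from outside this wide lane
      // For a little-endian left rotate by Amt elements, destination element
      // J reads source element (J - Amt) mod NumSubElts of the same lane.
      int Amt = (I + J - M) % NumSubElts;
      if (Amt < 0)
        Amt += NumSubElts;
      if (RotateAmt >= 0 && Amt != RotateAmt)
        return -1; // inconsistent rotation between elements
      RotateAmt = Amt;
    }
  }
  if (RotateAmt <= 0)
    return -1; // no rotation found, or a no-op shuffle
  return RotateAmt * EltSizeInBits;
}

int main() {
  // Byte-pair swap on v16i8: an 8-bit rotate of v8i16 lanes, which is the
  // vprotw $8 that the XOP test now produces.
  const std::vector<int> SwapPairs = {1, 0, 3,  2,  5,  4,  7,  6,
                                      9, 8, 11, 10, 13, 12, 15, 14};
  std::printf("rotate amount: %d bits\n",
              matchBitRotateAmountSketch(SwapPairs, /*NumSubElts=*/2,
                                         /*EltSizeInBits=*/8));

  // Halfword swap inside each dword: a 16-bit rotate of i32 lanes, matching
  // the vprold $16 in the AVX512 test checks.
  const std::vector<int> RotHalves = {2,  3,  0, 1, 6,  7,  4,  5,
                                      10, 11, 8, 9, 14, 15, 12, 13};
  std::printf("rotate amount: %d bits\n",
              matchBitRotateAmountSketch(RotHalves, /*NumSubElts=*/4,
                                         /*EltSizeInBits=*/8));
  return 0;
}

Running the sketch prints 8 bits for the byte-pair swap and 16 bits for the halfword swap, the same immediates that appear in the updated XOP and AVX512 test checks.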