From 7734e4b3a36f233df493e6101086a9c95d309a40 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 29 Mar 2020 16:41:37 +0100
Subject: [PATCH] [X86][AVX] Combine 128-bit lane shuffles with a zeroable
 upper half to EXTRACT_SUBVECTOR (PR40720)

As explained on PR40720, EXTRACTF128 is always as good/better than
VPERM2F128, and we can use the implicit zeroing of the upper half.

I've added some extra tests to vector-shuffle-combining-avx2.ll to make
sure we don't lose coverage.
---
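Worked example (illustrative only: a sketch in the spirit of the existing
shuffle_v4f64_23zz test below, using a hypothetical function name rather
than one of the tests touched by this patch). A shuffle that keeps one
128-bit lane of a 256-bit vector and zeroes the other lane:

  define <4 x double> @keep_upper_lane_zero_rest(<4 x double> %a) {
    ; Elements 2,3 come from %a; indices 4,5 select from the zero vector.
    %s = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
    ret <4 x double> %s
  }

previously lowered to the check line

  vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero

and now lowers to

  vextractf128 $1, %ymm0, %xmm0

The extract alone is enough because a VEX-encoded write to an XMM register
already zeroes bits 255:128 of the containing YMM register, so the "insert
back into a zeroed 256-bit vector" half of the extract+insert pattern costs
nothing.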
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 20 ++++-
 llvm/test/CodeGen/X86/avx-vperm2x128.ll          | 10 +--
 llvm/test/CodeGen/X86/vector-reduce-mul.ll       | 84 ++++++++-----------
 .../X86/vector-shuffle-combining-avx2.ll         | 49 +++++++++--
 4 files changed, 100 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3bb9c68df782..dff13183aab2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33963,9 +33963,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // Handle 128-bit lane shuffles of 256-bit vectors.
   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
+    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+
+    // If the upper half is zeroable, then an extract+insert is more optimal
+    // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
+    // zero the upper half.
+    if (isUndefOrZero(BaseMask[1])) {
+      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+        return SDValue(); // Nothing to do!
+      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+      Res = DAG.getBitcast(ShuffleVT, V1);
+      Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
+      Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+                           DL, 256);
+      return DAG.getBitcast(RootVT, Res);
+    }
+
     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
       return SDValue(); // Nothing to do!
-    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
 
     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
     // we need to use the zeroing feature.
@@ -46588,7 +46603,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
   // if the insert or extract can be represented with a subregister operation.
   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
-      (IdxVal != 0 || !Vec.isUndef())) {
+      (IdxVal != 0 ||
+       !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
     int ExtIdxVal = SubVec.getConstantOperandVal(1);
     if (ExtIdxVal != 0) {
       int VecNumElts = OpVT.getVectorNumElements();
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
index 24ac0988721d..63445066934e 100644
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -452,7 +452,7 @@ define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
 define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
 ; ALL-LABEL: shuffle_v4f64_23zz:
 ; ALL: # %bb.0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; ALL-NEXT: retq
   %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
   ret <4 x double> %s
@@ -460,7 +460,7 @@ define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
 define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_23zz_optsize:
 ; ALL: # %bb.0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; ALL-NEXT: retq
   %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
   ret <4 x double> %s
@@ -486,7 +486,7 @@ define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
 define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
 ; ALL-LABEL: shuffle_v4f64_67zz:
 ; ALL: # %bb.0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; ALL-NEXT: retq
   %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
   ret <4 x double> %s
@@ -494,7 +494,7 @@ define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
 define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_67zz_optsize:
 ; ALL: # %bb.0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; ALL-NEXT: retq
   %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
   ret <4 x double> %s
@@ -512,7 +512,7 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_67zz:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
   %s = shufflevector <4 x i64> , <4 x i64> %a, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index ca78dad09768..45f80fd22d69 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -2437,37 +2437,32 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQVL-LABEL: test_v64i8:
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm1
-; AVX512DQVL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
@@ -2858,32 +2853,27 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX512DQVL-LABEL: test_v128i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm4, %xmm1
 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm4, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
 ; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index d922de739c9d..bbd5bcb29a98 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -73,16 +73,16 @@ define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
   ret <32 x i8> %2
 }
 
-define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
-; X86-LABEL: combine_permq_pshufb_as_vperm2i128:
+define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
+; X86-LABEL: combine_permq_pshufb_as_vextracti128:
 ; X86: # %bb.0:
-; X86-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
 ; X86-NEXT: retl
 ;
-; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
+; X64-LABEL: combine_permq_pshufb_as_vextracti128:
 ; X64: # %bb.0:
-; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32>
@@ -93,6 +93,26 @@ define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
   ret <4 x i64> %5
 }
 
+define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
+; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
+; X86: # %bb.0:
+; X86-NEXT: vmovdqa %xmm0, %xmm0
+; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa %xmm0, %xmm0
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> )
+  %4 = bitcast <32 x i8> %3 to <4 x i64>
+  %5 = add <4 x i64> %4,
+  ret <4 x i64> %5
+}
+
 define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
 ; CHECK-LABEL: combine_as_vpermd:
 ; CHECK: # %bb.0:
@@ -117,11 +137,10 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
   ret <8 x float> %3
 }
 
-define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
-; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
+define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %xmm0, %xmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32>
   %2 = bitcast <4 x i64> %1 to <32 x i8>
@@ -129,6 +148,18 @@ define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
   ret <32 x i8> %3
 }
 
+define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: ret{{[l|q]}}
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> )
+  ret <32 x i8> %3
+}
+
 define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
 ; CHECK: # %bb.0: