[X86][AVX] Combine 128-bit lane shuffles with a zeroable upper half to EXTRACT_SUBVECTOR (PR40720)

As explained on PR40720, EXTRACTF128 is always as good/better than VPERM2F128, and we can use the implicit zeroing of the upper half.

I've added some extra tests to vector-shuffle-combining-avx2.ll to make sure we don't lose coverage.
This commit is contained in:
Simon Pilgrim 2020-03-29 16:41:37 +01:00
parent da4c7db793
commit 7734e4b3a3
4 changed files with 100 additions and 63 deletions

View File

@ -33963,9 +33963,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Handle 128-bit lane shuffles of 256-bit vectors.
if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
if (isUndefOrZero(BaseMask[1])) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
Res = DAG.getBitcast(ShuffleVT, V1);
Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
DL, 256);
return DAG.getBitcast(RootVT, Res);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
@ -46588,7 +46603,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 || !Vec.isUndef())) {
(IdxVal != 0 ||
!(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();

View File

@ -452,7 +452,7 @@ define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x double> %s
@ -460,7 +460,7 @@ define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x double> %s
@ -486,7 +486,7 @@ define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x double> %s
@ -494,7 +494,7 @@ define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x double> %s
@ -512,7 +512,7 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>

View File

@ -2437,37 +2437,32 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512DQVL-LABEL: test_v64i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm1
; AVX512DQVL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512DQVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm3, %xmm0
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovd %xmm0, %eax
@ -2858,32 +2853,27 @@ define i8 @test_v128i8(<128 x i8> %a0) {
;
; AVX512DQVL-LABEL: test_v128i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512DQVL-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm4, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm4, %xmm1
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm4, %xmm0
; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1

View File

@ -73,16 +73,16 @@ define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
ret <32 x i8> %2
}
define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vperm2i128:
define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vextracti128:
; X86: # %bb.0:
; X86-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64-LABEL: combine_permq_pshufb_as_vextracti128:
; X64: # %bb.0:
; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@ -93,6 +93,26 @@ define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
ret <4 x i64> %5
}
define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm0
; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
; X64: # %bb.0:
; X64-NEXT: vmovdqa %xmm0, %xmm0
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
%4 = bitcast <32 x i8> %3 to <4 x i64>
%5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
ret <4 x i64> %5
}
define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; CHECK-LABEL: combine_as_vpermd:
; CHECK: # %bb.0:
@ -117,11 +137,10 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
ret <8 x float> %3
}
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
@ -129,6 +148,18 @@ define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
ret <32 x i8> %3
}
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: ret{{[l|q]}}
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
ret <32 x i8> %3
}
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
; CHECK: # %bb.0: