[X86][AVX] lowerShuffleWithPERMV - pad 128/256-bit shuffles on non-VLX targets

Allow non-VLX targets to use 512-bit VPERMV/VPERMV3 for 128/256-bit shuffles.

TBH I'm not sure these targets actually exist in the wild, but we're testing for them and it's good test coverage for shuffle lowering/combines across different subvector widths.
Simon Pilgrim 2020-08-18 15:46:02 +01:00
parent 011bf4fd96
commit abd33bf5ef
3 changed files with 71 additions and 87 deletions
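
The transform is easiest to see at the element level: pad the source vector and the shuffle mask with don't-care elements out to the 512-bit element count, run the full-width permute, then keep only the low lanes. The following standalone C++ sketch models the unary (VPERMW-style) case; it is an illustration only, not LLVM code, and all names in it are invented for the example.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar model of a unary VPERMV-style permute: Out[i] = Table[Idx[i]].
// Indices are assumed to be in range for the table.
static std::vector<uint16_t> permv(const std::vector<uint16_t> &Table,
                                   const std::vector<int> &Idx) {
  std::vector<uint16_t> Out(Idx.size());
  for (size_t I = 0; I != Idx.size(); ++I)
    Out[I] = Table[Idx[I]];
  return Out;
}

int main() {
  // A v8i16 (128-bit) shuffle on a target that only has 512-bit VPERMW.
  std::vector<uint16_t> V1 = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<int> Mask = {7, 6, 5, 4, 3, 2, 1, 0};
  std::vector<uint16_t> Narrow = permv(V1, Mask);

  // Pad the source and the mask to 32 elements (v32i16). The padding value
  // is irrelevant: the original indices never reach the widened elements.
  std::vector<uint16_t> WideV1 = V1;
  WideV1.resize(32, 0);
  std::vector<int> WideMask = Mask;
  WideMask.resize(32, 0);
  std::vector<uint16_t> Wide = permv(WideV1, WideMask);

  // Extracting the low 8 lanes recovers the original 128-bit result.
  for (size_t I = 0; I != Narrow.size(); ++I)
    assert(Wide[I] == Narrow[I]);
  return 0;
}

The binary VPERMV3 case in the patch follows the same shape: both sources and the constant mask node are widened with widenSubVector, the full-width node is built, and extractSubVector trims the result back to the original width.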


@@ -14969,17 +14969,35 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 Mask, Subtarget, DAG);
 }
 
+// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
+// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
+// the active subvector is extracted.
 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
-                                     ArrayRef<int> Mask, SDValue V1,
-                                     SDValue V2, SelectionDAG &DAG) {
+                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
+                                     const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
-  if (V2.isUndef())
-    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
-  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+
+  MVT ShuffleVT = VT;
+  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+    V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
+    V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
+    MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
+    ShuffleVT = V1.getSimpleValueType();
+  }
+
+  SDValue Result;
+  if (V2.isUndef())
+    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
+  else
+    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
+
+  if (VT != ShuffleVT)
+    Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
+
+  return Result;
 }
 
 /// Generic lowering of v16i8 shuffles.
@@ -15208,9 +15226,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
       return Unpack;
 
-    // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
-    if (Subtarget.hasVBMI() && Subtarget.hasVLX())
-      return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+    // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+    if (Subtarget.hasVBMI())
+      return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
+                                   DAG);
 
     // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
     if (Subtarget.hasXOP()) {
@@ -16964,9 +16983,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                               Zeroable, Subtarget, DAG))
     return PSHUFB;
 
-  // AVX512BWVL can lower to VPERMW.
-  if (Subtarget.hasBWI() && Subtarget.hasVLX())
-    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+  // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
+  if (Subtarget.hasBWI())
+    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
@@ -17069,9 +17088,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                              Zeroable, Subtarget, DAG))
     return PSHUFB;
 
-  // AVX512VBMIVL can lower to VPERMB.
-  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
-    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+  // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+  if (Subtarget.hasVBMI())
+    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
@@ -17325,7 +17344,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                          Zeroable, Subtarget, DAG))
     return Blend;
 
-  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
 }
 
 /// Handle lowering of 16-lane 32-bit floating point shuffles.
@@ -17384,7 +17403,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        V1, V2, DAG, Subtarget))
     return V;
 
-  return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+  return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
 }
 
 /// Handle lowering of 8-lane 64-bit integer shuffles.
@@ -17447,7 +17466,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                          Zeroable, Subtarget, DAG))
     return Blend;
 
-  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
 }
 
 /// Handle lowering of 16-lane 32-bit integer shuffles.
@@ -17524,7 +17543,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                           Zeroable, Subtarget, DAG))
     return Blend;
 
-  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
 }
 
 /// Handle lowering of 32-lane 16-bit integer shuffles.
@@ -17587,7 +17606,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                               Zeroable, Subtarget, DAG))
     return PSHUFB;
 
-  return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+  return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
 }
 
 /// Handle lowering of 64-lane 8-bit integer shuffles.
@@ -17643,7 +17662,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // VBMI can use VPERMV/VPERMV3 byte shuffles.
   if (Subtarget.hasVBMI())
-    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
 
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
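
The regenerated checks in the test files below use VPERMT2W/VPERMT2B. As a reading aid, here is a simplified scalar model (my own sketch, covering only the unmasked 512-bit word form) of VPERMT2W: each of the 32 result words is selected from a 64-entry table, formed by the destination register followed by the second source, using the low six bits of the corresponding index word. Note the checks are in AT&T syntax, so in "vpermt2w %zmm0, %zmm0, %zmm1" the destination and first table is %zmm1, while %zmm0 serves as both the index vector and the second table.

#include <array>
#include <cstdint>

// Simplified scalar model of (unmasked) VPERMT2W zmm1, zmm2, zmm3:
// zmm1 is both the first lookup table and the destination, zmm2 holds the
// indices, and zmm3 is the second lookup table.
static std::array<uint16_t, 32> vpermt2w(const std::array<uint16_t, 32> &Dst,
                                         const std::array<uint16_t, 32> &Idx,
                                         const std::array<uint16_t, 32> &Src) {
  std::array<uint16_t, 32> Out{};
  for (int I = 0; I != 32; ++I) {
    unsigned Sel = Idx[I] & 63; // only the low 6 index bits are used
    Out[I] = Sel < 32 ? Dst[Sel] : Src[Sel - 32];
  }
  return Out;
}

int main() {
  std::array<uint16_t, 32> Dst{}, Idx{}, Src{};
  Src[1] = 42;
  Idx[0] = 33; // bit 5 set -> second table, element 1
  return vpermt2w(Dst, Idx, Src)[0] == 42 ? 0 : 1;
}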


@@ -85,12 +85,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
 ; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -260,20 +258,11 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
@@ -327,20 +316,11 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
@@ -394,20 +374,11 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:


@@ -328,8 +328,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpermt2b %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
@@ -413,8 +413,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpermt2b %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
@@ -457,13 +457,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -478,13 +475,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
+; AVX512VBMI-NEXT:    vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512VBMI-NEXT:    retq
 ;