[X86][AVX] lowerShuffleWithPERMV - adjust binary shuffle masks to account for widening on non-VLX targets

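rGabd33bf5eff2 enabled us to pad 128/256-bit shuffles to 512-bit on non-VLX targets, but it did not update the masks of binary shuffles to account for the new vector width: after widening, mask indices that refer to the second input must be offset so that they still select from the second (now wider) operand.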
This commit is contained in:
Simon Pilgrim 2020-09-06 14:52:06 +01:00
parent 667e800bb3
commit ecac5c2808
3 changed files with 36 additions and 19 deletions

View File

@ -14951,16 +14951,27 @@ static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1, SDValue V2, ArrayRef<int> Mask, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget, const X86Subtarget &Subtarget,
SelectionDAG &DAG) { SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, NumElts);
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
SDValue MaskNode;
MVT ShuffleVT = VT; MVT ShuffleVT = VT;
if (!VT.is512BitVector() && !Subtarget.hasVLX()) { if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512); V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512); V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
ShuffleVT = V1.getSimpleValueType(); ShuffleVT = V1.getSimpleValueType();
// Adjust mask to correct indices for the second input.
unsigned Scale = 512 / VT.getSizeInBits();
SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
for (int &M : AdjustedMask)
if (NumElts <= M)
M += (Scale - 1) * NumElts;
MaskNode = getConstVector(AdjustedMask, MaskVecVT, DAG, DL, true);
MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
} else {
MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
} }
SDValue Result; SDValue Result;

View File

@ -85,9 +85,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
; ;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1: ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW: # %bb.0: ; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,33,35,37,39,9,11,13,15,41,43,45,47]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: vzeroupper
@ -258,9 +259,10 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; ;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW: # %bb.0: ; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,33,37,41,45,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq ; AVX512BW-NEXT: retq
@ -316,9 +318,10 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; ;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW: # %bb.0: ; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,34,38,42,46,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq ; AVX512BW-NEXT: retq
@ -374,9 +377,10 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; ;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW: # %bb.0: ; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,35,39,43,47,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq ; AVX512BW-NEXT: retq

View File

@ -327,8 +327,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
; ;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0: ; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq ; AVX512VBMI-NEXT: retq
@ -412,8 +412,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
; ;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0: ; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq ; AVX512VBMI-NEXT: retq
@ -455,9 +455,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
; ;
; AVX512BW-LABEL: PR34175: ; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0: ; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq ; AVX512BW-NEXT: retq
@ -473,9 +474,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
; ;
; AVX512VBMI-LABEL: PR34175: ; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0: ; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq ; AVX512VBMI-NEXT: retq