[X86][AVX] lowerShuffleAsLanePermuteAndShuffle - don't split element rotate patterns

Partial element rotate patterns (e.g. for element insertion on Issue #53124) were being split whenever not every lane was crossing, but there is actually a good repeated mask hiding in there.
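For context, the kind of mask involved is a whole-vector rotation with undef in the positions a later blend overwrites, e.g. {-1, 0, 1, 2, 3, 4, 5, 6} for a <8 x float> (this mask is an assumption chosen to mirror the tests below, not taken from the issue). The sketch that follows is a minimal standalone illustration of how such a mask can be recognised as an element rotate; matchElementRotate is a hypothetical single-input simplification written for this note, not LLVM's matchShuffleAsElementRotate.

#include <cstdio>
#include <vector>

// Returns the rotation amount if Mask is a rotation of a single 8-element
// source (undef elements, encoded as -1, match any rotation), else -1.
static int matchElementRotate(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef element - compatible with any rotation
    int Candidate = (i - (M % NumElts) + NumElts) % NumElts;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // elements disagree on the rotation amount
  }
  return Rotation == 0 ? -1 : Rotation;
}

int main() {
  std::vector<int> Mask = {-1, 0, 1, 2, 3, 4, 5, 6};
  std::printf("rotation = %d\n", matchElementRotate(Mask)); // prints "rotation = 1"
  return 0;
}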
Simon Pilgrim 2022-01-13 11:58:57 +00:00
parent 971bd6f834
commit 57a551a8df
2 changed files with 33 additions and 32 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -16544,23 +16544,30 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
           lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
     return V;
 
+  // Always allow ElementRotate patterns - these are sometimes hidden but its
+  // still better to avoid splitting.
+  SDValue RotV1 = V1, RotV2 = V2;
+  bool IsElementRotate = 0 <= matchShuffleAsElementRotate(RotV1, RotV2, Mask);
+
   // If there are only inputs from one 128-bit lane, splitting will in fact be
   // less expensive. The flags track whether the given lane contains an element
   // that crosses to another lane.
-  if (!Subtarget.hasAVX2()) {
-    bool LaneCrossing[2] = {false, false};
-    for (int i = 0; i < Size; ++i)
-      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
-        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
-    if (!LaneCrossing[0] || !LaneCrossing[1])
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
-  } else {
-    bool LaneUsed[2] = {false, false};
-    for (int i = 0; i < Size; ++i)
-      if (Mask[i] >= 0)
-        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
-    if (!LaneUsed[0] || !LaneUsed[1])
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+  if (!IsElementRotate) {
+    if (!Subtarget.hasAVX2()) {
+      bool LaneCrossing[2] = {false, false};
+      for (int i = 0; i < Size; ++i)
+        if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
+          LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+      if (!LaneCrossing[0] || !LaneCrossing[1])
+        return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    } else {
+      bool LaneUsed[2] = {false, false};
+      for (int i = 0; i < Size; ++i)
+        if (Mask[i] >= 0)
+          LaneUsed[(Mask[i] % Size) / LaneSize] = true;
+      if (!LaneUsed[0] || !LaneUsed[1])
+        return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    }
   }
 
   // TODO - we could support shuffling V2 in the Flipped input.
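To see why these masks were previously being split, the snippet below re-runs the pre-AVX2 lane-crossing check from the hunk above on the illustrative mask used earlier (Size, LaneSize and the mask values are assumptions matching a <8 x float> shuffle, not output from a real compilation): only lane 0 gets marked as crossing, so the old code took the splitAndLowerShuffle early-out even though the whole vector is a single element rotate.

#include <cstdio>

int main() {
  // Illustrative v8f32 mask (rotate-by-1 with an undef lead element); Size and
  // LaneSize are fixed to the values a <8 x float> shuffle would use.
  const int Size = 8, LaneSize = 4;
  int Mask[8] = {-1, 0, 1, 2, 3, 4, 5, 6};

  // Same test the pre-AVX2 path in the hunk performs.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;

  // Only lane 0 contributes a crossing element (Mask[4] == 3 moves a lane-0
  // element into lane 1), so the pre-patch code split; with the patch the
  // ElementRotate match keeps the shuffle whole instead.
  std::printf("LaneCrossing = {%d, %d}\n", LaneCrossing[0], LaneCrossing[1]);
  return 0;
}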

llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll

@@ -1345,11 +1345,9 @@ define <8 x float> @shuffle_v8f32_01452367(<8 x float> %a) {
 define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_089abcde:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,0,1,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
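As a sanity check on what the new AVX1 sequence above computes, here is a hand-written C++ intrinsics rendition of the vperm2f128 / vshufps / vblendps chain (function name, immediates and test harness are mine for illustration, not compiler output); built with -mavx it reproduces the <0,8,9,10,11,12,13,14> shuffle, i.e. element 0 of %a followed by elements 0-6 of %b.

// compile with e.g.: clang++ -O2 -mavx example.cpp
#include <immintrin.h>
#include <cstdio>

// Rotate b right by one element across the full 256-bit register using only
// in-lane shuffles plus one 128-bit lane swap, then blend in a[0].
static __m256 shuffle_089abcde(__m256 a, __m256 b) {
  __m256 swap = _mm256_permute2f128_ps(b, b, 0x01);               // b[4..7] | b[0..3]
  __m256 t = _mm256_shuffle_ps(swap, b, _MM_SHUFFLE(0, 0, 0, 3)); // per lane: swap[3], swap[0], b[0], b[0]
  __m256 rot = _mm256_shuffle_ps(t, b, _MM_SHUFFLE(2, 1, 2, 0));  // -> b7,b0,b1,b2 | b3,b4,b5,b6
  return _mm256_blend_ps(a, rot, 0xFE);                           // keep a[0], take rotated b elsewhere
}

int main() {
  __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
  __m256 b = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
  float out[8];
  _mm256_storeu_ps(out, shuffle_089abcde(a, b));
  for (float f : out)
    std::printf("%g ", f); // expected: 0 8 9 10 11 12 13 14
  std::printf("\n");
  return 0;
}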
@@ -1402,11 +1400,9 @@ define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_01289abc:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,2,3,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2880,10 +2876,9 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_089abcde:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2954,10 +2949,9 @@ define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_01289abc:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;