[X86][AVX] lowerShuffleAsLanePermuteAndShuffle - don't split element rotate patterns
Partial element rotate patterns (e.g. for element insertion on Issue #53124) were being split if not every lane was crossing, but really there's a good repeated mask hiding in there.
commit 57a551a8df
parent 971bd6f834
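To illustrate the heuristic being bypassed: on the AVX1 path in the hunk below, a per-source-lane "crossing" flag is computed and the shuffle is split whenever one of the two flags stays clear. A minimal standalone sketch of that test in plain C++ follows; the sample mask {-1, 0, 1, 2, 3, 4, 5, 6} is an assumed illustration of the rotate-by-one shape that element insertion leaves behind, not a value captured from the compiler.

#include <array>
#include <cstdio>

int main() {
  // v8f32-style shuffle: 8 elements, two 128-bit lanes of 4 elements each.
  constexpr int Size = 8, LaneSize = 4;
  // Assumed sample mask: a rotate-by-one with element 0 undef (-1), the kind
  // of mask left over once the inserted element has been peeled into a blend.
  std::array<int, Size> Mask = {-1, 0, 1, 2, 3, 4, 5, 6};

  // Same per-lane crossing test as the AVX1 branch in the patch below.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;

  // Only source element 3 moves into the other 128-bit lane, so
  // LaneCrossing[1] stays false and the pre-patch code split the shuffle,
  // even though the mask as a whole is a single element rotation.
  std::printf("LaneCrossing = {%d, %d} -> %s\n", (int)LaneCrossing[0],
              (int)LaneCrossing[1],
              (!LaneCrossing[0] || !LaneCrossing[1]) ? "split" : "don't split");
  return 0;
}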
@@ -16544,23 +16544,30 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
           lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
     return V;
 
+  // Always allow ElementRotate patterns - these are sometimes hidden but its
+  // still better to avoid splitting.
+  SDValue RotV1 = V1, RotV2 = V2;
+  bool IsElementRotate = 0 <= matchShuffleAsElementRotate(RotV1, RotV2, Mask);
+
   // If there are only inputs from one 128-bit lane, splitting will in fact be
   // less expensive. The flags track whether the given lane contains an element
   // that crosses to another lane.
-  if (!Subtarget.hasAVX2()) {
-    bool LaneCrossing[2] = {false, false};
-    for (int i = 0; i < Size; ++i)
-      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
-        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
-    if (!LaneCrossing[0] || !LaneCrossing[1])
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
-  } else {
-    bool LaneUsed[2] = {false, false};
-    for (int i = 0; i < Size; ++i)
-      if (Mask[i] >= 0)
-        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
-    if (!LaneUsed[0] || !LaneUsed[1])
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
-  }
+  if (!IsElementRotate) {
+    if (!Subtarget.hasAVX2()) {
+      bool LaneCrossing[2] = {false, false};
+      for (int i = 0; i < Size; ++i)
+        if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
+          LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+      if (!LaneCrossing[0] || !LaneCrossing[1])
+        return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    } else {
+      bool LaneUsed[2] = {false, false};
+      for (int i = 0; i < Size; ++i)
+        if (Mask[i] >= 0)
+          LaneUsed[(Mask[i] % Size) / LaneSize] = true;
+      if (!LaneUsed[0] || !LaneUsed[1])
+        return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    }
+  }
 
   // TODO - we could support shuffling V2 in the Flipped input.
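The matchShuffleAsElementRotate call added above is used purely as a predicate here: a non-negative return means the mask can be treated as a rotation of the input elements, so splitting is skipped. A rough single-input sketch of that matching idea follows; it is a simplification for illustration (the helper name matchAsRotateSketch and the sample masks are mine), not the actual LLVM implementation, which also handles two-input rotations and rewrites V1/V2.

#include <array>
#include <cstdio>

// Simplified "is this mask an element rotation?" check for one 8-element
// input. Negative mask entries are undef and may match any rotation.
// Returns the rotation amount, or -1 if the defined elements disagree.
int matchAsRotateSketch(const std::array<int, 8> &Mask) {
  constexpr int NumElts = 8;
  int Rotation = -1;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // For a rotation R, result element i is input element (i + R) % NumElts.
    int Candidate = (M - i + NumElts) % NumElts;
    if (Rotation < 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // conflicting requirements -> not a rotation
  }
  return Rotation;
}

int main() {
  // The hidden-rotate mask from the earlier sketch: every defined element is
  // input element (i + 7) % 8, so this reports a rotation of 7.
  std::printf("rotation = %d\n",
              matchAsRotateSketch({-1, 0, 1, 2, 3, 4, 5, 6}));
  // A mask that is not a rotation, for contrast.
  std::printf("rotation = %d\n",
              matchAsRotateSketch({0, 0, 1, 2, 3, 4, 5, 6}));
  return 0;
}

With the guard in place, masks like the first one skip the split path and fall through to the lane-permute lowering exercised by the test changes below.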
@@ -1345,11 +1345,9 @@ define <8 x float> @shuffle_v8f32_01452367(<8 x float> %a) {
 define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_089abcde:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,0,1,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
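As a sanity check that the new vperm2f128 + vshufps + vblendps sequence really produces the <a[0], b[0..6]> insertion result, here is an element-level replay of the AVX1 output above, driven directly by the element mappings printed in the assembly comments. It is a standalone illustration (register names become plain arrays), not part of the test.

#include <array>
#include <cstdio>

using V8 = std::array<int, 8>;

int main() {
  // Tag a's elements as 0..7 and b's elements as 10..17 so the result reads
  // off easily.
  V8 a, b;
  for (int i = 0; i < 8; ++i) { a[i] = i; b[i] = 10 + i; }

  // vperm2f128 ymm2 = ymm1[2,3,0,1]: swap b's two 128-bit lanes.
  V8 ymm2;
  for (int i = 0; i < 8; ++i) ymm2[i] = b[(i + 4) % 8];

  // vshufps ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
  V8 t = {ymm2[3], ymm2[0], b[0], b[0], ymm2[7], ymm2[4], b[4], b[4]};

  // vshufps ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
  // (ymm2 now holds t from the previous step, ymm1 still holds b)
  V8 r = {t[0], t[2], b[1], b[2], t[4], t[6], b[5], b[6]};

  // vblendps ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
  V8 res = {a[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]};

  // Expected: 0 10 11 12 13 14 15 16, i.e. a[0] followed by b[0]..b[6].
  for (int v : res) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}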
@@ -1402,11 +1400,9 @@ define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_01289abc:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,2,3,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2880,10 +2876,9 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_089abcde:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2954,10 +2949,9 @@ define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_01289abc:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;