[X86] Fold SHUFPS(shuffle(x),shuffle(y),mask) -> SHUFPS(x,y,mask')

We can combine unary shuffles into either of SHUFPS's inputs and adjust the shuffle mask accordingly.

Unlike general shuffle combining, we can be more aggressive and handle multiuse cases, as we're not going to accidentally create additional shuffles.
Simon Pilgrim 2021-09-19 18:41:55 +01:00
parent 2ca637c976
commit b7342e3137
3 changed files with 40 additions and 6 deletions
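
As a rough standalone illustration (not part of the patch), the sketch below models SHUFPS on four floats and checks that a unary shuffle feeding either operand can be folded purely by remapping the 8-bit immediate, which is the index arithmetic the new SHUFP case performs. The shufps/shuffle helpers and all values here are ad-hoc assumptions for the demo, not LLVM APIs.

// Standalone sketch: shufps(shuffle(x,A), shuffle(y,B), imm) == shufps(x, y, imm'),
// where each low selector of imm is remapped through A and each high selector through B.
#include <array>
#include <cassert>
#include <cstdio>

using V4 = std::array<float, 4>;

// SHUFPS semantics: result[0..1] select from a, result[2..3] select from b,
// using the four 2-bit fields of the immediate.
static V4 shufps(const V4 &a, const V4 &b, unsigned imm) {
  return {a[imm & 3], a[(imm >> 2) & 3], b[(imm >> 4) & 3], b[(imm >> 6) & 3]};
}

// Unary shuffle: result[k] = v[mask[k]].
static V4 shuffle(const V4 &v, const std::array<int, 4> &mask) {
  return {v[mask[0]], v[mask[1]], v[mask[2]], v[mask[3]]};
}

int main() {
  V4 x = {0, 1, 2, 3}, y = {10, 11, 12, 13};
  std::array<int, 4> maskA = {2, 0, 3, 1}, maskB = {1, 1, 0, 2};
  unsigned imm = 0xC4; // selects elts 0,1 of op0 and elts 0,3 of op1

  V4 unfolded = shufps(shuffle(x, maskA), shuffle(y, maskB), imm);

  // Remap each 2-bit selector through the corresponding unary shuffle mask;
  // this mirrors Mask[Ofs] = SubScaledMask[Mask[Ofs] % 4] in the patch.
  unsigned folded = unsigned(maskA[imm & 3]) |
                    unsigned(maskA[(imm >> 2) & 3]) << 2 |
                    unsigned(maskB[(imm >> 4) & 3]) << 4 |
                    unsigned(maskB[(imm >> 6) & 3]) << 6;

  assert(unfolded == shufps(x, y, folded));
  std::printf("remapped immediate: 0x%02x\n", folded);
  return 0;
}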

@@ -38804,6 +38804,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
    }
    return SDValue();
  }
  case X86ISD::SHUFP: {
    // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
    // This is a more relaxed shuffle combiner that can ignore oneuse limits.
    // TODO: Support types other than v4f32.
    if (VT == MVT::v4f32) {
      bool Updated = false;
      SmallVector<int> Mask;
      SmallVector<SDValue> Ops;
      if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
          Ops.size() == 2) {
        for (int i = 0; i != 2; ++i) {
          SmallVector<SDValue> SubOps;
          SmallVector<int> SubMask, SubScaledMask;
          SDValue Sub = peekThroughBitcasts(Ops[i]);
          // TODO: Scaling might be easier if we specify the demanded elts.
          if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
              scaleShuffleElements(SubMask, 4, SubScaledMask) &&
              SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
            int Ofs = i * 2;
            Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
            Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
            Ops[i] = DAG.getBitcast(VT, SubOps[0]);
            Updated = true;
          }
        }
      }
      if (Updated) {
        for (int &M : Mask)
          M %= 4;
        Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
        return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
      }
    }
    return SDValue();
  }
  case X86ISD::VPERMI: {
    // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
    // TODO: Remove when we have preferred domains in combineX86ShuffleChain.

@@ -56,7 +56,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]

@@ -568,13 +568,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: simplify_select: