forked from OSchip/llvm-project
[X86] Fold SHUFPS(shuffle(x),shuffle(y),mask) -> SHUFPS(x,y,mask')
We can combine unary shuffles into either of SHUFPS's inputs and adjust the shuffle mask accordingly. Unlike general shuffle combining, we can be more aggressive and handle multiuse cases as we're not going to accidentally create additional shuffles.
This commit is contained in:
parent
2ca637c976
commit
b7342e3137
|
@@ -38804,6 +38804,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
|
|||
}
|
||||
return SDValue();
|
||||
}
|
||||
case X86ISD::SHUFP: {
|
||||
// Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
|
||||
// This is a more relaxed shuffle combiner that can ignore oneuse limits.
|
||||
// TODO: Support types other than v4f32.
|
||||
if (VT == MVT::v4f32) {
|
||||
bool Updated = false;
|
||||
SmallVector<int> Mask;
|
||||
SmallVector<SDValue> Ops;
|
||||
if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
|
||||
Ops.size() == 2) {
|
||||
for (int i = 0; i != 2; ++i) {
|
||||
SmallVector<SDValue> SubOps;
|
||||
SmallVector<int> SubMask, SubScaledMask;
|
||||
SDValue Sub = peekThroughBitcasts(Ops[i]);
|
||||
// TODO: Scaling might be easier if we specify the demanded elts.
|
||||
if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
|
||||
scaleShuffleElements(SubMask, 4, SubScaledMask) &&
|
||||
SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
|
||||
int Ofs = i * 2;
|
||||
Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
|
||||
Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
|
||||
Ops[i] = DAG.getBitcast(VT, SubOps[0]);
|
||||
Updated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Updated) {
|
||||
for (int &M : Mask)
|
||||
M %= 4;
|
||||
Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
|
||||
return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
|
||||
}
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
case X86ISD::VPERMI: {
|
||||
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
|
||||
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
|
||||
|
|
|
@@ -56,7 +56,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
|
||||
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
|
||||
|
|
|
@@ -568,13 +568,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
|
|||
; SSE2-NEXT: psrad $31, %xmm0
|
||||
; SSE2-NEXT: movd %edi, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE2-NEXT: por %xmm1, %xmm3
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
|
||||
; SSE2-NEXT: por %xmm1, %xmm2
|
||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
|
||||
; SSE2-NEXT: pand %xmm0, %xmm3
|
||||
; SSE2-NEXT: pand %xmm0, %xmm2
|
||||
; SSE2-NEXT: pandn %xmm1, %xmm0
|
||||
; SSE2-NEXT: por %xmm3, %xmm0
|
||||
; SSE2-NEXT: por %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: simplify_select:
|
||||
|
|
Loading…
Reference in New Issue