From b7342e3137d8fa7c356a80c1ddecf1d410c27eef Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 19 Sep 2021 18:41:55 +0100
Subject: [PATCH] [X86] Fold SHUFPS(shuffle(x),shuffle(y),mask) ->
 SHUFPS(x,y,mask')

We can combine unary shuffles into either of SHUFPS's inputs and adjust
the shuffle mask accordingly. Unlike general shuffle combining, we can
be more aggressive and handle multiuse cases as we're not going to
accidentally create additional shuffles.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 35 +++++++++++++++++++++++++
 llvm/test/CodeGen/X86/horizontal-sum.ll |  2 +-
 llvm/test/CodeGen/X86/vselect.ll        |  9 ++++-----
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81b011c27d0a..3187c41a4ab1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38804,6 +38804,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     }
     return SDValue();
   }
+  case X86ISD::SHUFP: {
+    // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
+    // This is a more relaxed shuffle combiner that can ignore oneuse limits.
+    // TODO: Support types other than v4f32.
+    if (VT == MVT::v4f32) {
+      bool Updated = false;
+      SmallVector<int> Mask;
+      SmallVector<SDValue> Ops;
+      if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
+          Ops.size() == 2) {
+        for (int i = 0; i != 2; ++i) {
+          SmallVector<SDValue> SubOps;
+          SmallVector<int> SubMask, SubScaledMask;
+          SDValue Sub = peekThroughBitcasts(Ops[i]);
+          // TODO: Scaling might be easier if we specify the demanded elts.
+          if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
+              scaleShuffleElements(SubMask, 4, SubScaledMask) &&
+              SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
+            int Ofs = i * 2;
+            Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
+            Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
+            Ops[i] = DAG.getBitcast(VT, SubOps[0]);
+            Updated = true;
+          }
+        }
+      }
+      if (Updated) {
+        for (int &M : Mask)
+          M %= 4;
+        Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
+      }
+    }
+    return SDValue();
+  }
   case X86ISD::VPERMI: {
     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 88691eedcab5..ad129547c3c4 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -56,7 +56,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 0a8283ffe3d2..adeeb874c7bb 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -568,13 +568,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    movd %edi, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: simplify_select:
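
For reference, below is a small standalone C++ sketch (not part of the patch, and not
LLVM code) of the mask arithmetic the new X86ISD::SHUFP case performs. The Mask4 alias
and foldOperandShuffle helper are made up for illustration only; the sketch assumes the
same convention as the combined SHUFP mask above, i.e. elements 0-1 index the first
operand (values 0-3) and elements 2-3 index the second operand (values 4-7).

// Standalone illustration (hypothetical helper, not LLVM code) of the mask
// arithmetic used by the new X86ISD::SHUFP case.
#include <array>
#include <cstdio>

using Mask4 = std::array<int, 4>;

// Fold a unary pre-shuffle on operand `OpIdx` (0 or 1) into the SHUFPS mask.
// SubMask[j] is the source lane of lane j in the pre-shuffle, i.e.
// shuffled[j] = original[SubMask[j]].
static Mask4 foldOperandShuffle(Mask4 ShufpsMask, const Mask4 &SubMask,
                                int OpIdx) {
  int Ofs = OpIdx * 2;
  ShufpsMask[Ofs + 0] = SubMask[ShufpsMask[Ofs + 0] % 4] + (OpIdx * 4);
  ShufpsMask[Ofs + 1] = SubMask[ShufpsMask[Ofs + 1] % 4] + (OpIdx * 4);
  return ShufpsMask;
}

int main() {
  // shufps(x', y) selecting x'[1], x'[3], y[1], y[1], where
  // x' = shuffle(x, <2,3,0,1>).
  Mask4 ShufpsMask = {1, 3, 4 + 1, 4 + 1};
  Mask4 SubMask = {2, 3, 0, 1}; // the unary shuffle feeding operand 0
  Mask4 Folded = foldOperandShuffle(ShufpsMask, SubMask, /*OpIdx=*/0);
  // Prints "3 1 5 5": the folded mask reads x[3], x[1], y[1], y[1] directly,
  // so the pre-shuffle of x is no longer needed.
  printf("%d %d %d %d\n", Folded[0], Folded[1], Folded[2], Folded[3]);
  return 0;
}

The final "M %= 4" loop in the patch only re-encodes the folded mask into the SHUFPS
imm8, where each operand's lanes are again numbered 0-3.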