diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81b011c27d0a..3187c41a4ab1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38804,6 +38804,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     }
     return SDValue();
   }
+  case X86ISD::SHUFP: {
+    // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
+    // This is a more relaxed shuffle combiner that can ignore oneuse limits.
+    // TODO: Support types other than v4f32.
+    if (VT == MVT::v4f32) {
+      bool Updated = false;
+      SmallVector<int> Mask;
+      SmallVector<SDValue> Ops;
+      if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
+          Ops.size() == 2) {
+        for (int i = 0; i != 2; ++i) {
+          SmallVector<SDValue> SubOps;
+          SmallVector<int> SubMask, SubScaledMask;
+          SDValue Sub = peekThroughBitcasts(Ops[i]);
+          // TODO: Scaling might be easier if we specify the demanded elts.
+          if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
+              scaleShuffleElements(SubMask, 4, SubScaledMask) &&
+              SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
+            int Ofs = i * 2;
+            Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
+            Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
+            Ops[i] = DAG.getBitcast(VT, SubOps[0]);
+            Updated = true;
+          }
+        }
+      }
+      if (Updated) {
+        for (int &M : Mask)
+          M %= 4;
+        Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
+      }
+    }
+    return SDValue();
+  }
   case X86ISD::VPERMI: {
     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 88691eedcab5..ad129547c3c4 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -56,7 +56,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 0a8283ffe3d2..adeeb874c7bb 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -568,13 +568,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    movd %edi, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: simplify_select:
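For illustration only (not part of the patch): a minimal standalone C++ sketch of the mask composition the new X86ISD::SHUFP case performs, using hypothetical mask values. The SHUFP mask is remapped through an operand's own single-input shuffle mask so the SHUFP can read that shuffle's source directly, mirroring the Mask[Ofs + n] = SubScaledMask[Mask[Ofs + n] % 4] + (i * 4) arithmetic above.

// Standalone sketch (hypothetical values, plain C++, not LLVM code).
#include <array>
#include <cstdio>

int main() {
  // A SHUFP mask over two v4f32 operands: entries 0..3 read op0 (x),
  // entries 4..7 read op1 (y). Here: result = { x[1], x[3], y[1], y[1] }.
  std::array<int, 4> Mask = {1, 3, 5, 5};
  // Suppose operand 1 (y) is itself a single-input shuffle of y0,
  // i.e. y[j] == y0[SubMask[j]].
  std::array<int, 4> SubMask = {1, 0, 3, 2};
  // Remap the half of the SHUFP mask that reads y through SubMask,
  // as the patch does for i == 1 (Ofs = i * 2 scaled to mask entries 2..3).
  for (int Ofs = 2; Ofs != 4; ++Ofs)
    Mask[Ofs] = SubMask[Mask[Ofs] % 4] + 4;
  // The composed mask now reads y0 directly: { x[1], x[3], y0[0], y0[0] }.
  for (int M : Mask)
    std::printf("%d ", M); // prints: 1 3 4 4
  std::printf("\n");
  // Reducing each entry modulo 4 yields the per-operand lane indices that
  // get encoded into the SHUFPS immediate (cf. getV4X86ShuffleImm8ForMask).
  return 0;
}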