diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1f352ae47726..8a63163cade7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33266,12 +33266,15 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { // the result is the same as the high half. If a target shuffle is also // replicating low and high halves, we don't need the shuffle. if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { - // movddup (hadd X, X) --> hadd X, X - // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X - assert((HOp.getValueType() == MVT::v2f64 || - HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && - "Unexpected type for h-op"); - return HOp; + if (HOp.getScalarValueSizeInBits() == 64) { + // movddup (hadd X, X) --> hadd X, X + // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X + assert((HOp.getValueType() == MVT::v2f64 || + HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && + "Unexpected type for h-op"); + return HOp; + } + return SDValue(); } // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index b5b7cfde8ae2..0787269072b2 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -700,3 +700,28 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> ret <16 x i16> %shuf } + +define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) { +; SSSE3-LABEL: broadcast_haddps_v4f32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: haddps %xmm0, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-NEXT: retq +; +; AVX1-LABEL: broadcast_haddps_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: broadcast_haddps_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0) + %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %2 +} + +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)