diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a1600230cc9b..6c8def1c397d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25094,8 +25094,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 3}) && FloatDomain) {
-      // On SSE41 targets use BLENDPD (its commutable).
-      if (Subtarget.hasSSE2() && !Subtarget.hasSSE41()) {
+      if (Subtarget.hasSSE2()) {
        std::swap(V1, V2);
        Shuffle = X86ISD::MOVSD;
        ShuffleVT = MVT::v2f64;
@@ -25103,12 +25102,9 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
      }
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && FloatDomain) {
-      // On SSE41 targets use BLENDPS (its commutable).
-      if (!Subtarget.hasSSE41()) {
-        Shuffle = X86ISD::MOVSS;
-        ShuffleVT = MVT::v4f32;
-        return true;
-      }
+      Shuffle = X86ISD::MOVSS;
+      ShuffleVT = MVT::v4f32;
+      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
      V2 = V1;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 950fbc5e4025..30437afb15bb 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8535,6 +8535,13 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all,
 // patterns we have to try to match.
 multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   let Predicates = [HasAVX512] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 056479c71244..afbb1d3a1730 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3217,9 +3217,15 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   }
 
-  // Repeat everything for AVX, except for the movss + scalar combo...
-  // because that one shouldn't occur with AVX codegen?
+  // Repeat everything for AVX.
   let Predicates = [UseAVX] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 1fe5b54d2bbf..228e78275376 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -91,11 +91,17 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a) {
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: test_sqrt_ss:
-; AVX: # BB#0:
-; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test_sqrt_ss:
+; AVX1: # BB#0:
+; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_sqrt_ss:
+; AVX512: # BB#0:
+; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = call float @llvm.sqrt.f32(float %1)
   %3 = insertelement <4 x float> %a, float %2, i32 0
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 3cb754c8f93f..4a009023a7bd 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -177,7 +177,8 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
 ; X32-LABEL: insertps_or_blendps:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: insertps_or_blendps:
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 309fa98145c6..e07d7c3e8c7a 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -729,8 +729,8 @@ define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
 ;
 ; SSE41-LABEL: blend_shufflevector_8xfloat:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: blend_shufflevector_8xfloat:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 6ef383c858e5..71bee75507c2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -246,10 +246,20 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v2f64_03:
-; AVX: # BB#0:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v2f64_03:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2f64_03:
+; AVX2: # BB#0:
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v2f64_03:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
 }
@@ -274,10 +284,20 @@ define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v2f64_21:
-; AVX: # BB#0:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v2f64_21:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2f64_21:
+; AVX2: # BB#0:
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v2f64_21:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX512VL-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index c286fd426b14..16b2efe88a4b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -49,16 +49,23 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
 ;
 ; SSE41-LABEL: combine_pshufb_as_movsd:
 ; SSE41: # BB#0:
-; SSE41-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_pshufb_as_movsd:
-; AVX: # BB#0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1],xmm0[0]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufb_as_movsd:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufb_as_movsd:
+; AVX2: # BB#0:
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_pshufb_as_movsd:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512F-NEXT: retq
   %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> 
   %2 = bitcast <2 x double> %1 to <16 x i8>
   %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> )
@@ -77,10 +84,20 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_pshufb_as_movss:
-; AVX: # BB#0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufb_as_movss:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufb_as_movss:
+; AVX2: # BB#0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_pshufb_as_movss:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512F-NEXT: retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> 
   %2 = bitcast <4 x float> %1 to <16 x i8>
   %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> )
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 121fafc84c5e..c1d4446023d4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -262,15 +262,13 @@ define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32: # BB#0:
 ; X32-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X32-NEXT: vmovapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00]
-; X32-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd:
 ; X64: # BB#0:
 ; X64-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X64-NEXT: vmovapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00]
-; X64-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; X64-NEXT: retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2)
   ret <2 x double> %1
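Illustrative note, not part of the patch: the new movss/movsd TableGen patterns above exist to keep folding the usual extract/op/reinsert scalar idiom once shuffle lowering starts preferring MOVSS/MOVSD over the SSE4.1 blends. A minimal sketch of that idiom is below; the function name @fold_add_ss is a made-up example (the in-tree tests such as test_sqrt_ss in sse-scalar-fp-arith.ll exercise the same shape), and on an AVX target these patterns should let it fold into a single vaddss rather than an add plus a separate insert.

define <4 x float> @fold_add_ss(<4 x float> %a, float %b) {
  ; scalar op on element 0, result reinserted into element 0 of the source vector
  %ext = extractelement <4 x float> %a, i32 0
  %add = fadd float %ext, %b
  %ins = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %ins
}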