diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b89383d2ce6d..4a8a09a28fd5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1250,17 +1250,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v16i1, Expand); if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); if (Subtarget.hasVLX()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); } } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9895a8f29f9b..a3e57fa58c7b 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6562,6 +6562,48 @@ let Predicates = [HasAVX512] in { (VCVTPS2PDZrm addr:$src)>; } +let Predicates = [HasDQI, NoVLX] in { +def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; +} + //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f4ae6abcd808..6b6ac840fa59 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -716,6 +716,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { // potential massive combinations (elem_num x src_type x dst_type). static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, @@ -726,7 +728,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, diff --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll index 693efc2ffbfb..6a4bc3c21ea9 100644 --- a/llvm/test/Analysis/CostModel/X86/fptosi.ll +++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll @@ -23,7 +23,8 @@ define i32 @fptosi_double_i64(i32 %arg) { ; SSE42: cost of 13 {{.*}} %V4I64 = fptosi ; AVX1: cost of 12 {{.*}} %V4I64 = fptosi ; AVX2: cost of 12 {{.*}} %V4I64 = fptosi - ; AVX512: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptosi %V4I64 = fptosi <4 x double> undef to <4 x i64> ; SSE2: cost of 27 {{.*}} %V8I64 = fptosi ; SSE42: cost of 27 {{.*}} %V8I64 = fptosi @@ -145,7 +146,8 @@ define i32 @fptosi_float_i64(i32 %arg) { ; SSE42: cost of 13 {{.*}} %V4I64 = fptosi ; AVX1: cost of 12 {{.*}} %V4I64 = fptosi ; AVX2: cost of 12 {{.*}} %V4I64 = fptosi - ; AVX512: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptosi %V4I64 = fptosi <4 x float> undef to <4 x i64> ; SSE2: cost of 27 {{.*}} %V8I64 = fptosi ; SSE42: cost of 27 {{.*}} %V8I64 = fptosi diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll index 966a2c3ef504..371666141d0a 100644 --- a/llvm/test/Analysis/CostModel/X86/sitofp.ll +++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll @@ -110,7 +110,7 @@ define i32 @sitofp_i64_double() { ; AVX1: cost of 13 {{.*}} sitofp <4 x i64> ; AVX2: cost of 13 {{.*}} sitofp <4 x i64> ; AVX512F: cost of 13 {{.*}} sitofp <4 x i64> - ; AVX512DQ: cost of 13 {{.*}} sitofp <4 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <4 x i64> %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; SSE2: cost of 80 {{.*}} sitofp <8 x i64> @@ -229,7 +229,7 @@ define i32 @sitofp_i64_float() { ; AVX1: cost of 10 {{.*}} sitofp <4 x i64> ; AVX2: cost of 10 {{.*}} sitofp <4 x i64> ; AVX512F: cost of 10 {{.*}} sitofp <4 x i64> - ; AVX512DQ: cost of 10 {{.*}} sitofp <4 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <4 x i64> %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> ; SSE2: cost of 60 {{.*}} sitofp <8 x i64> diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index bd41fafe3bb1..e3c711d3a50f 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -225,20 +225,9 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; ; AVX512DQ-LABEL: fptosi_4f64_to_4i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vcvttsd2si %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512DQ-NEXT: vcvttsd2si %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vcvttsd2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512DQ-NEXT: vcvttsd2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64: @@ -755,20 +744,9 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; ; AVX512DQ-LABEL: fptoui_4f64_to_4i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512DQ-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64: @@ -972,12 +950,9 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { ; ; AVX512DQ-LABEL: fptosi_4f32_to_2i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rcx -; AVX512DQ-NEXT: vmovq %rcx, %xmm0 -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64: @@ -1100,20 +1075,8 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; ; AVX512DQ-LABEL: fptosi_4f32_to_4i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512DQ-NEXT: vcvttss2si %xmm2, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64: @@ -1511,12 +1474,9 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { ; ; AVX512DQ-LABEL: fptoui_4f32_to_2i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2usi %xmm1, %rax -; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512DQ-NEXT: vmovq %rcx, %xmm0 -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64: @@ -1815,20 +1775,8 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; ; AVX512DQ-LABEL: fptoui_4f32_to_4i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512DQ-NEXT: vcvttss2usi %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512DQ-NEXT: vcvttss2usi %xmm2, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm2 -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index bbcd228f3d25..66fa03dd3baf 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -306,18 +306,9 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm1, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64: @@ -865,18 +856,9 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm1, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64: @@ -1134,14 +1116,9 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { ; ; AVX512DQ-LABEL: sitofp_2i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32: @@ -1208,14 +1185,9 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef: @@ -1445,18 +1417,9 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32: @@ -1710,14 +1673,9 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; ; AVX512DQ-LABEL: uitofp_2i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32: @@ -1846,14 +1804,9 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef: @@ -2277,18 +2230,9 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32: @@ -2725,19 +2669,9 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm1, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64: @@ -3161,19 +3095,9 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm1, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64: @@ -3410,19 +3334,9 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32: @@ -4007,19 +3921,9 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll index e1efe63abb78..3d472bb2c208 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -39,20 +39,41 @@ define void @sitofp_2i64_2f64() #0 { } define void @sitofp_4i64_4f64() #0 { -; CHECK-LABEL: @sitofp_4i64_4f64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; CHECK-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_4i64_4f64( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double +; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double +; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double +; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double +; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 +; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: ret void +; +; AVX256-LABEL: @sitofp_4i64_4f64( +; AVX256-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double +; AVX256-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double +; AVX256-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double +; AVX256-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double +; AVX256-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 +; AVX256-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; AVX256-NEXT: ret void +; +; AVX512-LABEL: @sitofp_4i64_4f64( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX512-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -559,20 +580,41 @@ define void @sitofp_2i64_2f32() #0 { } define void @sitofp_4i64_4f32() #0 { -; CHECK-LABEL: @sitofp_4i64_4f32( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; CHECK-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_4i64_4f32( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX256-LABEL: @sitofp_4i64_4f32( +; AVX256-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; AVX256-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; AVX256-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float +; AVX256-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float +; AVX256-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; AVX256-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256-NEXT: ret void +; +; AVX512-LABEL: @sitofp_4i64_4f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX512-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8