From ee0333b4a9d5baf3165f09cd6fb4b520fae620e6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 16 Nov 2018 22:53:00 +0000 Subject: [PATCH] [X86] Add custom promotion of narrow fp_to_uint/fp_to_sint operations under -x86-experimental-vector-widening-legalization. This tries to force the result type to vXi32 followed by a truncate. This can help avoid scalarization that would otherwise occur. There are some annoying examples of an avx512 truncate instruction followed by a packus where we should really be able to just use one truncate. But overall this is still a net improvement. llvm-svn: 347105 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 51 ++- llvm/test/CodeGen/X86/avx512-cvt-widen.ll | 42 +-- llvm/test/CodeGen/X86/vec_cast2.ll | 94 +----- llvm/test/CodeGen/X86/vec_cast3.ll | 28 +- llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll | 334 ++++--------------- 5 files changed, 145 insertions(+), 404 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5c07d7d906f1..8f422ce524be 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -899,10 +899,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); - // Custom legalize these to avoid over promotion. + + // Custom legalize these to avoid over promotion or custom promotion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); @@ -26287,7 +26295,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // Promote these manually to avoid over promotion to v2i64. Type // legalization will revisit the v2i32 operation for more cleanup. if ((VT == MVT::v2i8 || VT == MVT::v2i16) && - getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { // AVX512DQ provides instructions that produce a v2i64 result. if (Subtarget.hasDQI()) return; @@ -26302,6 +26310,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (VT.isVector() && VT.getScalarSizeInBits() < 32) { + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; + + // Try to create a 128 bit vector, but don't exceed a 32 bit element. + unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); + MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), + VT.getVectorNumElements()); + unsigned Opc = N->getOpcode(); + if (PromoteVT == MVT::v2i32 || PromoteVT == MVT::v4i32) + Opc = ISD::FP_TO_SINT; + + SDValue Res = DAG.getNode(Opc, dl, PromoteVT, Src); + + // Preserve what we know about the size of the original result. Except + // when the result is v2i32 since we can't widen the assert. 
+ if (PromoteVT != MVT::v2i32) + Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext + : ISD::AssertSext, + dl, PromoteVT, Res, + DAG.getValueType(VT.getVectorElementType())); + + // Truncate back to the original width. + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + + // Now widen to 128 bits. + unsigned NumConcats = 128 / VT.getSizeInBits(); + MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), + VT.getVectorNumElements() * NumConcats); + SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); + Results.push_back(Res); + return; + } + + if (VT == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); diff --git a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll index a4d9a8272a38..82681ec7d418 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll @@ -502,33 +502,21 @@ define <8 x i16> @f64to8us(<8 x double> %f) { } define <8 x i8> @f64to8uc(<8 x double> %f) { -; ALL-LABEL: f64to8uc: -; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; ALL-NEXT: vcvttsd2si %xmm1, %eax -; ALL-NEXT: vcvttsd2si %xmm0, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; ALL-NEXT: vcvttsd2si %xmm0, %eax -; ALL-NEXT: vpinsrb $6, %eax, 
%xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvttsd2si %xmm0, %eax -; ALL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; NOVL-LABEL: f64to8uc: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8uc: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll index 1bc4b690487f..d746bf591210 100644 --- a/llvm/test/CodeGen/X86/vec_cast2.ll +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -172,29 +172,10 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v8f32_v8i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vzeroupper ; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> @@ -229,17 +210,8 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -253,11 +225,8 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4i16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i16> ret <4 x i16> %res @@ -274,29 +243,10 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v8f32_v8u8: ; CHECK-WIDE: ## 
%bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vzeroupper ; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i8> @@ -331,17 +281,8 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -355,11 +296,8 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i16> ret <4 x i16> %res diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll index e8662b8cc34d..e4e6aa52ff56 100644 --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -117,11 +117,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -136,11 +133,8 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw 
%xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> %res @@ -170,11 +164,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -189,11 +180,8 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll index f4e0147e3f1a..47078daa9259 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll @@ -2310,31 +2310,17 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; 
SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm1, %eax -; VEX-NEXT: vcvttss2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f32_to_2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2342,64 +2328,15 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: fptosi_2f32_to_2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_2f32_to_2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 
-; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2407,31 +2344,17 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; 
SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm1, %eax -; VEX-NEXT: vcvttss2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_2f32_to_2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2439,64 +2362,15 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_2f32_to_2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f32_to_2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: 
vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2504,22 +2378,16 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm1, %eax -; AVX-NEXT: vcvttsd2si %xmm0, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; 
AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i8> ret <2 x i8> %cvt @@ -2528,55 +2396,15 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm1, %eax -; VEX-NEXT: vcvttsd2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; 
AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2584,22 +2412,16 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fptoui_2f64_to_2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm1, %eax -; AVX-NEXT: vcvttsd2si %xmm0, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i8> ret <2 x i8> %cvt @@ -2608,55 +2430,15 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm1, 
%eax -; VEX-NEXT: vcvttsd2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt }