[X86] Add custom promotion of narrow fp_to_uint/fp_to_sint operations under -x86-experimental-vector-widening-legalization.

This tries to force the conversion's result type to vXi32 and then truncate back to the original narrow element type. This can help avoid the scalarization that would otherwise occur.

There are some annoying examples of an AVX512 truncate instruction followed by a packus where we should really be able to use just one truncate, but overall this is still a net improvement.

llvm-svn: 347105
This commit is contained in:
Craig Topper 2018-11-16 22:53:00 +00:00
parent ac35cd330a
commit ee0333b4a9
5 changed files with 145 additions and 404 deletions

View File

@ -899,10 +899,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
// Custom legalize these to avoid over promotion.
// Custom legalize these to avoid over promotion or custom promotion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
@ -26287,7 +26295,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// Promote these manually to avoid over promotion to v2i64. Type
// legalization will revisit the v2i32 operation for more cleanup.
if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
// AVX512DQ provides instructions that produce a v2i64 result.
if (Subtarget.hasDQI())
return;
@ -26302,6 +26310,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
unsigned Opc = N->getOpcode();
if (PromoteVT == MVT::v2i32 || PromoteVT == MVT::v4i32)
Opc = ISD::FP_TO_SINT;
SDValue Res = DAG.getNode(Opc, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
: ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
return;
}
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");

View File

@ -502,33 +502,21 @@ define <8 x i16> @f64to8us(<8 x double> %f) {
}
define <8 x i8> @f64to8uc(<8 x double> %f) {
; ALL-LABEL: f64to8uc:
; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; ALL-NEXT: vcvttsd2si %xmm1, %eax
; ALL-NEXT: vcvttsd2si %xmm0, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
; ALL-NEXT: vcvttsd2si %xmm2, %eax
; ALL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; ALL-NEXT: vcvttsd2si %xmm2, %eax
; ALL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; ALL-NEXT: vcvttsd2si %xmm2, %eax
; ALL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; ALL-NEXT: vcvttsd2si %xmm2, %eax
; ALL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; ALL-NEXT: vcvttsd2si %xmm0, %eax
; ALL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: vcvttsd2si %xmm0, %eax
; ALL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
; NOVL-LABEL: f64to8uc:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
; NOVL-NEXT: vpmovdw %zmm0, %ymm0
; NOVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to8uc:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i8>
ret <8 x i8> %res
}

View File

@ -172,29 +172,10 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v8f32_v8i8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
@ -229,17 +210,8 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
@ -253,11 +225,8 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i16>
ret <4 x i16> %res
@ -274,29 +243,10 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v8f32_v8u8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl
%res = fptoui <8 x float> %src to <8 x i8>
@ -331,17 +281,8 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-WIDE-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i8>
ret <4 x i8> %res
@ -355,11 +296,8 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; CHECK-WIDE-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i16>
ret <4 x i16> %res

View File

@ -117,11 +117,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-WIDE-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i8>
ret <2 x i8> %res
@ -136,11 +133,8 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-WIDE-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i16>
ret <2 x i16> %res
@ -170,11 +164,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-WIDE-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i8>
ret <2 x i8> %res
@ -189,11 +180,8 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) {
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-WIDE-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i16>
ret <2 x i16> %res

View File

@ -2310,31 +2310,17 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %ecx
; SSE-NEXT: shll $8, %ecx
; SSE-NEXT: orl %eax, %ecx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f32_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %eax
; VEX-NEXT: vcvttss2si %xmm0, %ecx
; VEX-NEXT: vmovd %ecx, %xmm0
; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512-LABEL: fptosi_2f32_to_2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-LABEL: fptosi_2f32_to_2i8:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@ -2342,64 +2328,15 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pinsrw $1, %ecx, %xmm0
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_2f32_to_2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_2f32_to_2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f32_to_2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f32_to_2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f32_to_2i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
; AVX-LABEL: fptosi_2f32_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
@ -2407,31 +2344,17 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %ecx
; SSE-NEXT: shll $8, %ecx
; SSE-NEXT: orl %eax, %ecx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f32_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %eax
; VEX-NEXT: vcvttss2si %xmm0, %ecx
; VEX-NEXT: vmovd %ecx, %xmm0
; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512-LABEL: fptoui_2f32_to_2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-LABEL: fptoui_2f32_to_2i8:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@ -2439,64 +2362,15 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pinsrw $1, %ecx, %xmm0
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_2f32_to_2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_2f32_to_2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
; AVX-LABEL: fptoui_2f32_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
@ -2504,22 +2378,16 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %ecx
; SSE-NEXT: shll $8, %ecx
; SSE-NEXT: orl %eax, %ecx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vcvttsd2si %xmm1, %eax
; AVX-NEXT: vcvttsd2si %xmm0, %ecx
; AVX-NEXT: vmovd %ecx, %xmm0
; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
@ -2528,55 +2396,15 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pinsrw $1, %ecx, %xmm0
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f64_to_2i16:
; VEX: # %bb.0:
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm1, %eax
; VEX-NEXT: vcvttsd2si %xmm0, %ecx
; VEX-NEXT: vmovd %ecx, %xmm0
; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f64_to_2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f64_to_2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f64_to_2i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
; AVX-LABEL: fptosi_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}
@ -2584,22 +2412,16 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %ecx
; SSE-NEXT: shll $8, %ecx
; SSE-NEXT: orl %eax, %ecx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vcvttsd2si %xmm1, %eax
; AVX-NEXT: vcvttsd2si %xmm0, %ecx
; AVX-NEXT: vmovd %ecx, %xmm0
; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
@ -2608,55 +2430,15 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pinsrw $1, %ecx, %xmm0
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_2i16:
; VEX: # %bb.0:
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm1, %eax
; VEX-NEXT: vcvttsd2si %xmm0, %ecx
; VEX-NEXT: vmovd %ecx, %xmm0
; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_2i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
; AVX-LABEL: fptoui_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}