forked from OSchip/llvm-project
[VectorLegalizer] Enable TargetLowering::expandFP_TO_UINT support.
Add vector support to TargetLowering::expandFP_TO_UINT. This exposes an issue in X86TargetLowering::LowerVSELECT, which was assuming that the select mask was the same width as the LHS/RHS ops - as long as the mask is a sign splat we can easily sext/trunc it to the required width. llvm-svn: 345473
This commit is contained in:
parent
5b30571753
commit
9b77f0c291
|
@ -86,9 +86,10 @@ class VectorLegalizer {
|
||||||
/// operations to legalize them.
|
/// operations to legalize them.
|
||||||
SDValue Expand(SDValue Op);
|
SDValue Expand(SDValue Op);
|
||||||
|
|
||||||
/// Implements expansion for FNEG; falls back to UnrollVectorOp if
|
/// Implements expansion for FP_TO_UINT; falls back to UnrollVectorOp if
|
||||||
/// FSUB isn't legal.
|
/// FP_TO_SINT isn't legal.
|
||||||
///
|
SDValue ExpandFP_TO_UINT(SDValue Op);
|
||||||
|
|
||||||
/// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
|
/// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
|
||||||
/// SINT_TO_FLOAT and SHR on vectors isn't legal.
|
/// SINT_TO_FLOAT and SHR on vectors isn't legal.
|
||||||
SDValue ExpandUINT_TO_FLOAT(SDValue Op);
|
SDValue ExpandUINT_TO_FLOAT(SDValue Op);
|
||||||
|
@ -709,6 +710,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
|
||||||
return ExpandVSELECT(Op);
|
return ExpandVSELECT(Op);
|
||||||
case ISD::SELECT:
|
case ISD::SELECT:
|
||||||
return ExpandSELECT(Op);
|
return ExpandSELECT(Op);
|
||||||
|
case ISD::FP_TO_UINT:
|
||||||
|
return ExpandFP_TO_UINT(Op);
|
||||||
case ISD::UINT_TO_FP:
|
case ISD::UINT_TO_FP:
|
||||||
return ExpandUINT_TO_FLOAT(Op);
|
return ExpandUINT_TO_FLOAT(Op);
|
||||||
case ISD::FNEG:
|
case ISD::FNEG:
|
||||||
|
@ -1018,6 +1021,16 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
|
||||||
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
|
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) {
|
||||||
|
// Attempt to expand using TargetLowering.
|
||||||
|
SDValue Result;
|
||||||
|
if (TLI.expandFP_TO_UINT(Op.getNode(), Result, DAG))
|
||||||
|
return Result;
|
||||||
|
|
||||||
|
// Otherwise go ahead and unroll.
|
||||||
|
return DAG.UnrollVectorOp(Op.getNode());
|
||||||
|
}
|
||||||
|
|
||||||
SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
|
SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
|
||||||
EVT VT = Op.getOperand(0).getValueType();
|
EVT VT = Op.getOperand(0).getValueType();
|
||||||
SDLoc DL(Op);
|
SDLoc DL(Op);
|
||||||
|
|
|
@ -4147,6 +4147,11 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
|
||||||
EVT SetCCVT =
|
EVT SetCCVT =
|
||||||
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
|
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
|
||||||
|
|
||||||
|
// Only expand vector types if we have the appropriate vector bit operations.
|
||||||
|
if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) ||
|
||||||
|
!isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
|
||||||
|
return false;
|
||||||
|
|
||||||
// Expand based on maximum range of FP_TO_SINT:
|
// Expand based on maximum range of FP_TO_SINT:
|
||||||
// True = fp_to_sint(Src)
|
// True = fp_to_sint(Src)
|
||||||
// False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
|
// False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
|
||||||
|
|
|
@ -15698,7 +15698,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||||
|
|
||||||
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
|
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
|
||||||
// with patterns on the mask registers on AVX-512.
|
// with patterns on the mask registers on AVX-512.
|
||||||
if (Cond.getScalarValueSizeInBits() == 1)
|
MVT CondVT = Cond.getSimpleValueType();
|
||||||
|
unsigned CondEltSize = Cond.getScalarValueSizeInBits();
|
||||||
|
if (CondEltSize == 1)
|
||||||
return Op;
|
return Op;
|
||||||
|
|
||||||
// Variable blends are only legal from SSE4.1 onward.
|
// Variable blends are only legal from SSE4.1 onward.
|
||||||
|
@ -15707,24 +15709,34 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||||
|
|
||||||
SDLoc dl(Op);
|
SDLoc dl(Op);
|
||||||
MVT VT = Op.getSimpleValueType();
|
MVT VT = Op.getSimpleValueType();
|
||||||
|
unsigned EltSize = VT.getScalarSizeInBits();
|
||||||
|
unsigned NumElts = VT.getVectorNumElements();
|
||||||
|
|
||||||
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
|
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
|
||||||
// into an i1 condition so that we can use the mask-based 512-bit blend
|
// into an i1 condition so that we can use the mask-based 512-bit blend
|
||||||
// instructions.
|
// instructions.
|
||||||
if (VT.getSizeInBits() == 512) {
|
if (VT.getSizeInBits() == 512) {
|
||||||
// The vNi1 condition case should be handled above as it can be trivially
|
|
||||||
// lowered.
|
|
||||||
assert(Cond.getScalarValueSizeInBits() == VT.getScalarSizeInBits() &&
|
|
||||||
"Should have a size-matched integer condition!");
|
|
||||||
// Build a mask by testing the condition against zero.
|
// Build a mask by testing the condition against zero.
|
||||||
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
|
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
|
||||||
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
|
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
|
||||||
getZeroVector(VT, Subtarget, DAG, dl),
|
getZeroVector(CondVT, Subtarget, DAG, dl),
|
||||||
ISD::SETNE);
|
ISD::SETNE);
|
||||||
// Now return a new VSELECT using the mask.
|
// Now return a new VSELECT using the mask.
|
||||||
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
|
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SEXT/TRUNC cases where the mask doesn't match the destination size.
|
||||||
|
if (CondEltSize != EltSize) {
|
||||||
|
// If we don't have a sign splat, rely on the expansion.
|
||||||
|
if (CondEltSize != DAG.ComputeNumSignBits(Cond))
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
MVT NewCondSVT = MVT::getIntegerVT(EltSize);
|
||||||
|
MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
|
||||||
|
Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
|
||||||
|
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
|
||||||
|
}
|
||||||
|
|
||||||
// Only some types will be legal on some subtargets. If we can emit a legal
|
// Only some types will be legal on some subtargets. If we can emit a legal
|
||||||
// VSELECT-matching blend, return Op, but if we need to expand, return
|
// VSELECT-matching blend, return Op, but if we need to expand, return
|
||||||
// a null value.
|
// a null value.
|
||||||
|
@ -15743,7 +15755,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||||
case MVT::v8i16:
|
case MVT::v8i16:
|
||||||
case MVT::v16i16: {
|
case MVT::v16i16: {
|
||||||
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
|
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
|
||||||
MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
|
MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
|
||||||
Cond = DAG.getBitcast(CastVT, Cond);
|
Cond = DAG.getBitcast(CastVT, Cond);
|
||||||
LHS = DAG.getBitcast(CastVT, LHS);
|
LHS = DAG.getBitcast(CastVT, LHS);
|
||||||
RHS = DAG.getBitcast(CastVT, RHS);
|
RHS = DAG.getBitcast(CastVT, RHS);
|
||||||
|
|
|
@ -63,25 +63,19 @@ define double @trunc_unsigned_f64(double %x) #0 {
|
||||||
define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
|
define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
|
||||||
; SSE2-LABEL: trunc_unsigned_v4f32:
|
; SSE2-LABEL: trunc_unsigned_v4f32:
|
||||||
; SSE2: # %bb.0:
|
; SSE2: # %bb.0:
|
||||||
|
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; SSE2-NEXT: movaps %xmm0, %xmm1
|
; SSE2-NEXT: movaps %xmm0, %xmm1
|
||||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
|
; SSE2-NEXT: cmpltps %xmm2, %xmm1
|
||||||
; SSE2-NEXT: cvttss2si %xmm1, %rax
|
; SSE2-NEXT: cvttps2dq %xmm0, %xmm3
|
||||||
; SSE2-NEXT: movd %eax, %xmm1
|
; SSE2-NEXT: subps %xmm2, %xmm0
|
||||||
; SSE2-NEXT: movaps %xmm0, %xmm2
|
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
|
||||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
|
; SSE2-NEXT: xorps {{.*}}(%rip), %xmm0
|
||||||
; SSE2-NEXT: cvttss2si %xmm2, %rax
|
; SSE2-NEXT: andps %xmm1, %xmm3
|
||||||
; SSE2-NEXT: movd %eax, %xmm2
|
; SSE2-NEXT: andnps %xmm0, %xmm1
|
||||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
; SSE2-NEXT: orps %xmm3, %xmm1
|
||||||
; SSE2-NEXT: cvttss2si %xmm0, %rax
|
; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
|
||||||
; SSE2-NEXT: movd %eax, %xmm1
|
; SSE2-NEXT: andps %xmm1, %xmm0
|
||||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
; SSE2-NEXT: orps {{.*}}(%rip), %xmm0
|
||||||
; SSE2-NEXT: cvttss2si %xmm0, %rax
|
|
||||||
; SSE2-NEXT: movd %eax, %xmm0
|
|
||||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
||||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
|
||||||
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535]
|
|
||||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
|
||||||
; SSE2-NEXT: por {{.*}}(%rip), %xmm0
|
|
||||||
; SSE2-NEXT: psrld $16, %xmm1
|
; SSE2-NEXT: psrld $16, %xmm1
|
||||||
; SSE2-NEXT: por {{.*}}(%rip), %xmm1
|
; SSE2-NEXT: por {{.*}}(%rip), %xmm1
|
||||||
; SSE2-NEXT: addps {{.*}}(%rip), %xmm1
|
; SSE2-NEXT: addps {{.*}}(%rip), %xmm1
|
||||||
|
|
|
@ -317,25 +317,13 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
|
||||||
;
|
;
|
||||||
; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
|
; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
|
||||||
; CHECK-WIDE: ## %bb.0:
|
; CHECK-WIDE: ## %bb.0:
|
||||||
; CHECK-WIDE-NEXT: subl $68, %esp
|
; CHECK-WIDE-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; CHECK-WIDE-NEXT: .cfi_def_cfa_offset 72
|
; CHECK-WIDE-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
|
||||||
; CHECK-WIDE-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
|
; CHECK-WIDE-NEXT: vsubps %xmm1, %xmm0, %xmm1
|
||||||
; CHECK-WIDE-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
|
; CHECK-WIDE-NEXT: vcvttps2dq %xmm1, %xmm1
|
||||||
; CHECK-WIDE-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp)
|
; CHECK-WIDE-NEXT: vxorps LCPI11_1, %xmm1, %xmm1
|
||||||
; CHECK-WIDE-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
|
; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
|
||||||
; CHECK-WIDE-NEXT: flds {{[0-9]+}}(%esp)
|
; CHECK-WIDE-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
; CHECK-WIDE-NEXT: fisttpll {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: flds {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: fisttpll {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: flds {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: fisttpll {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: flds {{[0-9]+}}(%esp)
|
|
||||||
; CHECK-WIDE-NEXT: fisttpll (%esp)
|
|
||||||
; CHECK-WIDE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
||||||
; CHECK-WIDE-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
|
|
||||||
; CHECK-WIDE-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
|
|
||||||
; CHECK-WIDE-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0
|
|
||||||
; CHECK-WIDE-NEXT: addl $68, %esp
|
|
||||||
; CHECK-WIDE-NEXT: retl
|
; CHECK-WIDE-NEXT: retl
|
||||||
%res = fptoui <2 x float> %src to <2 x i32>
|
%res = fptoui <2 x float> %src to <2 x i32>
|
||||||
ret <2 x i32> %res
|
ret <2 x i32> %res
|
||||||
|
|
|
@ -627,16 +627,36 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
|
||||||
; SSE-NEXT: movaps %xmm1, %xmm0
|
; SSE-NEXT: movaps %xmm1, %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; VEX-LABEL: fptoui_4f64_to_2i32:
|
; AVX1-LABEL: fptoui_4f64_to_2i32:
|
||||||
; VEX: # %bb.0:
|
; AVX1: # %bb.0:
|
||||||
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
; AVX1-NEXT: vmovapd %xmm0, %xmm0
|
||||||
; VEX-NEXT: vmovd %eax, %xmm1
|
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
|
||||||
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
||||||
; VEX-NEXT: vmovd %eax, %xmm0
|
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
||||||
; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1
|
||||||
; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
|
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
|
||||||
; VEX-NEXT: retq
|
; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
|
||||||
|
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
|
||||||
|
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
|
; AVX1-NEXT: vzeroupper
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: fptoui_4f64_to_2i32:
|
||||||
|
; AVX2: # %bb.0:
|
||||||
|
; AVX2-NEXT: vmovapd %xmm0, %xmm0
|
||||||
|
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
|
; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
|
||||||
|
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
|
||||||
|
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
||||||
|
; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
|
||||||
|
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
|
||||||
|
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
|
; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
|
||||||
|
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
|
||||||
|
; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
|
; AVX2-NEXT: vzeroupper
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: fptoui_4f64_to_2i32:
|
; AVX512F-LABEL: fptoui_4f64_to_2i32:
|
||||||
; AVX512F: # %bb.0:
|
; AVX512F: # %bb.0:
|
||||||
|
@ -930,21 +950,34 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
|
||||||
; SSE-NEXT: movaps %xmm1, %xmm0
|
; SSE-NEXT: movaps %xmm1, %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; VEX-LABEL: fptoui_4f64_to_4i32:
|
; AVX1-LABEL: fptoui_4f64_to_4i32:
|
||||||
; VEX: # %bb.0:
|
; AVX1: # %bb.0:
|
||||||
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
; VEX-NEXT: vcvttsd2si %xmm1, %rax
|
; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
|
||||||
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
||||||
; VEX-NEXT: vmovd %ecx, %xmm1
|
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
||||||
; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1
|
||||||
; VEX-NEXT: vextractf128 $1, %ymm0, %xmm0
|
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
|
||||||
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
|
||||||
; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
|
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
|
||||||
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
; VEX-NEXT: vcvttsd2si %xmm0, %rax
|
; AVX1-NEXT: vzeroupper
|
||||||
; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
|
; AVX1-NEXT: retq
|
||||||
; VEX-NEXT: vzeroupper
|
;
|
||||||
; VEX-NEXT: retq
|
; AVX2-LABEL: fptoui_4f64_to_4i32:
|
||||||
|
; AVX2: # %bb.0:
|
||||||
|
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
|
; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
|
||||||
|
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
|
||||||
|
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
||||||
|
; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
|
||||||
|
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
|
||||||
|
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
|
; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
|
||||||
|
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
|
||||||
|
; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
|
; AVX2-NEXT: vzeroupper
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: fptoui_4f64_to_4i32:
|
; AVX512F-LABEL: fptoui_4f64_to_4i32:
|
||||||
; AVX512F: # %bb.0:
|
; AVX512F: # %bb.0:
|
||||||
|
@ -1570,39 +1603,41 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
|
||||||
define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
|
define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
|
||||||
; SSE-LABEL: fptoui_4f32_to_4i32:
|
; SSE-LABEL: fptoui_4f32_to_4i32:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
|
; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; SSE-NEXT: movaps %xmm0, %xmm1
|
; SSE-NEXT: movaps %xmm0, %xmm1
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
|
; SSE-NEXT: cmpltps %xmm2, %xmm1
|
||||||
; SSE-NEXT: cvttss2si %xmm1, %rax
|
; SSE-NEXT: cvttps2dq %xmm0, %xmm3
|
||||||
; SSE-NEXT: movd %eax, %xmm1
|
; SSE-NEXT: subps %xmm2, %xmm0
|
||||||
; SSE-NEXT: movaps %xmm0, %xmm2
|
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
|
||||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
|
; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
|
||||||
; SSE-NEXT: cvttss2si %xmm2, %rax
|
; SSE-NEXT: andps %xmm1, %xmm3
|
||||||
; SSE-NEXT: movd %eax, %xmm2
|
; SSE-NEXT: andnps %xmm0, %xmm1
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
; SSE-NEXT: orps %xmm3, %xmm1
|
||||||
; SSE-NEXT: cvttss2si %xmm0, %rax
|
; SSE-NEXT: movaps %xmm1, %xmm0
|
||||||
; SSE-NEXT: movd %eax, %xmm1
|
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
|
||||||
; SSE-NEXT: cvttss2si %xmm0, %rax
|
|
||||||
; SSE-NEXT: movd %eax, %xmm0
|
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
||||||
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
|
||||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; VEX-LABEL: fptoui_4f32_to_4i32:
|
; AVX1-LABEL: fptoui_4f32_to_4i32:
|
||||||
; VEX: # %bb.0:
|
; AVX1: # %bb.0:
|
||||||
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; VEX-NEXT: vcvttss2si %xmm1, %rax
|
; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
|
||||||
; VEX-NEXT: vcvttss2si %xmm0, %rcx
|
; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1
|
||||||
; VEX-NEXT: vmovd %ecx, %xmm1
|
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
|
||||||
; VEX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1
|
||||||
; VEX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
|
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
|
||||||
; VEX-NEXT: vcvttss2si %xmm2, %rax
|
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
; VEX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
|
; AVX1-NEXT: retq
|
||||||
; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
;
|
||||||
; VEX-NEXT: vcvttss2si %xmm0, %rax
|
; AVX2-LABEL: fptoui_4f32_to_4i32:
|
||||||
; VEX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
|
; AVX2: # %bb.0:
|
||||||
; VEX-NEXT: retq
|
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
|
; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
|
||||||
|
; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
|
||||||
|
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
|
; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1
|
||||||
|
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
|
||||||
|
; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: fptoui_4f32_to_4i32:
|
; AVX512F-LABEL: fptoui_4f32_to_4i32:
|
||||||
; AVX512F: # %bb.0:
|
; AVX512F: # %bb.0:
|
||||||
|
@ -1853,95 +1888,51 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
|
||||||
define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
|
define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
|
||||||
; SSE-LABEL: fptoui_8f32_to_8i32:
|
; SSE-LABEL: fptoui_8f32_to_8i32:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
|
; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; SSE-NEXT: movaps %xmm0, %xmm2
|
; SSE-NEXT: movaps %xmm0, %xmm2
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
; SSE-NEXT: cmpltps %xmm4, %xmm2
|
||||||
; SSE-NEXT: cvttss2si %xmm0, %rax
|
; SSE-NEXT: cvttps2dq %xmm0, %xmm3
|
||||||
; SSE-NEXT: movd %eax, %xmm0
|
; SSE-NEXT: subps %xmm4, %xmm0
|
||||||
; SSE-NEXT: movaps %xmm2, %xmm3
|
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
|
||||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
|
; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
|
||||||
; SSE-NEXT: cvttss2si %xmm3, %rax
|
; SSE-NEXT: xorps %xmm5, %xmm0
|
||||||
; SSE-NEXT: movd %eax, %xmm3
|
; SSE-NEXT: andps %xmm2, %xmm3
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
|
; SSE-NEXT: andnps %xmm0, %xmm2
|
||||||
; SSE-NEXT: cvttss2si %xmm2, %rax
|
; SSE-NEXT: orps %xmm3, %xmm2
|
||||||
; SSE-NEXT: movd %eax, %xmm0
|
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
|
|
||||||
; SSE-NEXT: cvttss2si %xmm2, %rax
|
|
||||||
; SSE-NEXT: movd %eax, %xmm2
|
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
|
||||||
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
|
||||||
; SSE-NEXT: movaps %xmm1, %xmm2
|
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
|
|
||||||
; SSE-NEXT: cvttss2si %xmm2, %rax
|
|
||||||
; SSE-NEXT: movd %eax, %xmm2
|
|
||||||
; SSE-NEXT: movaps %xmm1, %xmm3
|
; SSE-NEXT: movaps %xmm1, %xmm3
|
||||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
|
; SSE-NEXT: cmpltps %xmm4, %xmm3
|
||||||
; SSE-NEXT: cvttss2si %xmm3, %rax
|
; SSE-NEXT: cvttps2dq %xmm1, %xmm0
|
||||||
; SSE-NEXT: movd %eax, %xmm3
|
; SSE-NEXT: subps %xmm4, %xmm1
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
|
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
|
||||||
; SSE-NEXT: cvttss2si %xmm1, %rax
|
; SSE-NEXT: xorps %xmm5, %xmm1
|
||||||
; SSE-NEXT: movd %eax, %xmm2
|
; SSE-NEXT: andps %xmm3, %xmm0
|
||||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
|
; SSE-NEXT: andnps %xmm1, %xmm3
|
||||||
; SSE-NEXT: cvttss2si %xmm1, %rax
|
; SSE-NEXT: orps %xmm0, %xmm3
|
||||||
; SSE-NEXT: movd %eax, %xmm1
|
; SSE-NEXT: movaps %xmm2, %xmm0
|
||||||
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
; SSE-NEXT: movaps %xmm3, %xmm1
|
||||||
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
|
|
||||||
; SSE-NEXT: movdqa %xmm2, %xmm1
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX1-LABEL: fptoui_8f32_to_8i32:
|
; AVX1-LABEL: fptoui_8f32_to_8i32:
|
||||||
; AVX1: # %bb.0:
|
; AVX1: # %bb.0:
|
||||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
|
||||||
; AVX1-NEXT: vcvttss2si %xmm2, %rax
|
; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1
|
||||||
; AVX1-NEXT: vcvttss2si %xmm1, %rcx
|
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
|
||||||
; AVX1-NEXT: vmovd %ecx, %xmm2
|
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
|
||||||
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
|
||||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
|
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
|
||||||
; AVX1-NEXT: vcvttss2si %xmm3, %rax
|
|
||||||
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
|
||||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
|
|
||||||
; AVX1-NEXT: vcvttss2si %xmm1, %rax
|
|
||||||
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
|
|
||||||
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
|
||||||
; AVX1-NEXT: vcvttss2si %xmm2, %rax
|
|
||||||
; AVX1-NEXT: vcvttss2si %xmm0, %rcx
|
|
||||||
; AVX1-NEXT: vmovd %ecx, %xmm2
|
|
||||||
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
|
||||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
|
|
||||||
; AVX1-NEXT: vcvttss2si %xmm3, %rax
|
|
||||||
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
|
||||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
||||||
; AVX1-NEXT: vcvttss2si %xmm0, %rax
|
|
||||||
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
|
|
||||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
||||||
; AVX1-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX2-LABEL: fptoui_8f32_to_8i32:
|
; AVX2-LABEL: fptoui_8f32_to_8i32:
|
||||||
; AVX2: # %bb.0:
|
; AVX2: # %bb.0:
|
||||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
|
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
|
||||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
|
||||||
; AVX2-NEXT: vcvttss2si %xmm2, %rax
|
; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1
|
||||||
; AVX2-NEXT: vcvttss2si %xmm1, %rcx
|
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
|
||||||
; AVX2-NEXT: vmovd %ecx, %xmm2
|
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
|
||||||
; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1
|
||||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
|
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
|
||||||
; AVX2-NEXT: vcvttss2si %xmm3, %rax
|
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
|
||||||
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
|
||||||
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
|
|
||||||
; AVX2-NEXT: vcvttss2si %xmm1, %rax
|
|
||||||
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
|
|
||||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
|
||||||
; AVX2-NEXT: vcvttss2si %xmm2, %rax
|
|
||||||
; AVX2-NEXT: vcvttss2si %xmm0, %rcx
|
|
||||||
; AVX2-NEXT: vmovd %ecx, %xmm2
|
|
||||||
; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
|
||||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
|
|
||||||
; AVX2-NEXT: vcvttss2si %xmm3, %rax
|
|
||||||
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
|
||||||
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
||||||
; AVX2-NEXT: vcvttss2si %xmm0, %rax
|
|
||||||
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
|
|
||||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
||||||
; AVX2-NEXT: retq
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: fptoui_8f32_to_8i32:
|
; AVX512F-LABEL: fptoui_8f32_to_8i32:
|
||||||
|
|
Loading…
Reference in New Issue