[x86] use vector instructions to lower more FP->int->FP casts

This is an enhancement to D77895 to avoid another
round-trip from XMM->GPR->XMM. This time we handle
the case of starting and ending with an f64 and
casting to a signed i32 as the intermediate value
(f64 -> i32 -> f64).

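It's a bit more involved than I initially assumed
because we need to use target-specific opcodes to
represent the non-standard cast ops: the f64 case
goes v2f64 -> v4i32 -> v2f64, so the FP and integer
vectors have different element counts, and
X86ISD::CVTTP2SI / X86ISD::CVTSI2P are used in
place of ISD::FP_TO_SINT / ISD::SINT_TO_FP.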

Differential Revision: https://reviews.llvm.org/D78362
This commit is contained in:
Sanjay Patel 2020-04-19 08:32:02 -04:00
parent 8c68de2d63
commit cceb630a07
4 changed files with 35 additions and 29 deletions

View File

@ -19178,17 +19178,25 @@ static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
MVT IntVT = CastToInt.getSimpleValueType();
SDValue X = CastToInt.getOperand(0);
// TODO: Allow size-changing from source to dest (double -> i32 -> float)
if (X.getSimpleValueType() != VT ||
VT.getSizeInBits() != IntVT.getSizeInBits())
if (X.getSimpleValueType() != VT)
return SDValue();
// See if we have a 128-bit vector cast op for this type of cast.
unsigned NumEltsInXMM = 128 / VT.getScalarSizeInBits();
MVT VecFPVT = MVT::getVectorVT(VT, NumEltsInXMM);
MVT VecIntVT = MVT::getVectorVT(IntVT, NumEltsInXMM);
if (!useVectorCast(CastToFP.getOpcode(), VecIntVT, VecFPVT, Subtarget))
// See if we have 128-bit vector cast instructions for this type of cast.
// We need cvttps2dq + cvtdq2ps or cvttpd2dq + cvtdq2pd.
if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
IntVT != MVT::i32)
return SDValue();
unsigned NumFPEltsInXMM = 128 / VT.getScalarSizeInBits();
unsigned NumIntEltsInXMM = 128 / IntVT.getScalarSizeInBits();
MVT VecFPVT = MVT::getVectorVT(VT, NumFPEltsInXMM);
MVT VecIntVT = MVT::getVectorVT(IntVT, NumIntEltsInXMM);
// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
bool NeedX86Opcodes = VT.getSizeInBits() != IntVT.getSizeInBits();
unsigned ToIntOpcode = NeedX86Opcodes ? X86ISD::CVTTP2SI : ISD::FP_TO_SINT;
unsigned ToFPOpcode = NeedX86Opcodes ? X86ISD::CVTSI2P : ISD::SINT_TO_FP;
// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
//
// We are not defining the high elements (for example, zero them) because
@ -19198,8 +19206,8 @@ static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
SDLoc DL(CastToFP);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecFPVT, X);
SDValue VCastToInt = DAG.getNode(ISD::FP_TO_SINT, DL, VecIntVT, VecX);
SDValue VCastToFP = DAG.getNode(ISD::SINT_TO_FP, DL, VecFPVT, VCastToInt);
SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecFPVT, VCastToInt);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}

View File

@ -263,15 +263,14 @@ define float @trunc_signed_f32_nsz(float %x) #0 {
define double @trunc_signed32_f64_no_fast_math(double %x) {
; SSE-LABEL: trunc_signed32_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sd %eax, %xmm0
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_signed32_f64_no_fast_math:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttsd2si %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0
; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-NEXT: retq
%i = fptosi double %x to i32
%r = sitofp i32 %i to double
@ -281,9 +280,8 @@ define double @trunc_signed32_f64_no_fast_math(double %x) {
define double @trunc_signed32_f64_nsz(double %x) #0 {
; SSE2-LABEL: trunc_signed32_f64_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttsd2si %xmm0, %eax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %eax, %xmm0
; SSE2-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed32_f64_nsz:

View File

@ -7,8 +7,8 @@
define i32 @isint_return(double %d) nounwind {
; CHECK64-LABEL: isint_return:
; CHECK64: # %bb.0:
; CHECK64-NEXT: cvttsd2si %xmm0, %eax
; CHECK64-NEXT: cvtsi2sd %eax, %xmm1
; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK64-NEXT: cmpeqsd %xmm0, %xmm1
; CHECK64-NEXT: movq %xmm1, %rax
; CHECK64-NEXT: andl $1, %eax
@ -18,8 +18,8 @@ define i32 @isint_return(double %d) nounwind {
; CHECK32-LABEL: isint_return:
; CHECK32: # %bb.0:
; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK32-NEXT: cvttsd2si %xmm0, %eax
; CHECK32-NEXT: cvtsi2sd %eax, %xmm1
; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK32-NEXT: cmpeqsd %xmm0, %xmm1
; CHECK32-NEXT: movd %xmm1, %eax
; CHECK32-NEXT: andl $1, %eax
@ -62,8 +62,8 @@ declare void @foo()
define void @isint_branch(double %d) nounwind {
; CHECK64-LABEL: isint_branch:
; CHECK64: # %bb.0:
; CHECK64-NEXT: cvttsd2si %xmm0, %eax
; CHECK64-NEXT: cvtsi2sd %eax, %xmm1
; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK64-NEXT: ucomisd %xmm1, %xmm0
; CHECK64-NEXT: jne .LBB2_2
; CHECK64-NEXT: jp .LBB2_2
@ -77,8 +77,8 @@ define void @isint_branch(double %d) nounwind {
; CHECK32-LABEL: isint_branch:
; CHECK32: # %bb.0:
; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK32-NEXT: cvttsd2si %xmm0, %eax
; CHECK32-NEXT: cvtsi2sd %eax, %xmm1
; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK32-NEXT: ucomisd %xmm1, %xmm0
; CHECK32-NEXT: jne .LBB2_2
; CHECK32-NEXT: jp .LBB2_2

View File

@ -5,8 +5,8 @@ define zeroext i8 @t(double %x) nounwind readnone {
; CHECK-LABEL: t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvttsd2si %xmm0, %eax
; CHECK-NEXT: cvtsi2sd %eax, %xmm1
; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK-NEXT: cmpeqsd %xmm0, %xmm1
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: andl $1, %eax
@ -24,8 +24,8 @@ define zeroext i8 @u(double %x) nounwind readnone {
; CHECK-LABEL: u:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvttsd2si %xmm0, %eax
; CHECK-NEXT: cvtsi2sd %eax, %xmm1
; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1
; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK-NEXT: cmpneqsd %xmm0, %xmm1
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: andl $1, %eax