[X86] Don't scalarize v2f32->v2i64 strict_fp_to_sint/uint with avx512dq and not avx512vl.

We can pad the v2f32 with 0s up to v8f32 and use a v8f32->v8i64
operation. This is what we end up with on non-strict nodes except
we don't pad with 0s since we don't care about exceptions.
This commit is contained in:
Craig Topper 2020-06-07 14:44:17 -07:00
parent 3badd17b69
commit a135c4a2cf
3 changed files with 87 additions and 69 deletions

View File

@ -1719,6 +1719,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@ -1838,19 +1851,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
@ -20717,6 +20717,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
if (!Subtarget.hasVLX()) {
// Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
// legalizer and then widened again by vector op legalization.
if (!IsStrict)
return SDValue();
SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
{Src, Zero, Zero, Zero});
Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Tmp});
SDValue Chain = Tmp.getValue(1);
Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Tmp, Chain}, dl);
return Tmp;
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));

View File

@ -685,26 +685,14 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero
; AVX512DQ-32-NEXT: vcvttps2qq %ymm1, %zmm1
; AVX512DQ-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512DQ-32-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-32-NEXT: vzeroupper
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-64-NEXT: vmovq %rax, %xmm1
; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-64-NEXT: vmovq %rax, %xmm0
; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-64-NEXT: retq
; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: ret{{[l|q]}}
;
; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
; AVX512VLDQ: # %bb.0:
@ -1016,26 +1004,14 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero
; AVX512DQ-32-NEXT: vcvttps2uqq %ymm1, %zmm1
; AVX512DQ-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512DQ-32-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-32-NEXT: vzeroupper
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax
; AVX512DQ-64-NEXT: vmovq %rax, %xmm1
; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax
; AVX512DQ-64-NEXT: vmovq %rax, %xmm0
; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-64-NEXT: retq
; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: ret{{[l|q]}}
;
; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; AVX512VLDQ: # %bb.0:

View File

@ -3976,14 +3976,30 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 {
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vcvttps2qq {{.*}}(%rip), %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,
@ -4588,14 +4604,21 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: retq
; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vcvttps2uqq {{.*}}(%rip), %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,