From 234a5e8ea422bb97c2b15848fccf5c396023b729 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 21 Feb 2019 20:40:39 +0000
Subject: [PATCH] [x86] vectorize more cast ops in lowering to avoid register
 file transfers

This is a follow-up to D56864.

If we're extracting from a non-zero index before casting to FP, then shuffle
the vector and optionally narrow the vector before doing the cast:

cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0

This might be enough to close PR39974:
https://bugs.llvm.org/show_bug.cgi?id=39974

Differential Revision: https://reviews.llvm.org/D58197

llvm-svn: 354619
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 29 ++++--
 .../test/CodeGen/X86/known-signbits-vector.ll |  4 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        | 93 +++++++++++++------
 3 files changed, 90 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c75fabbac48b..dfc7d8ba200b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17636,28 +17636,41 @@ static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
 /// round-trip between XMM and GPR.
 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
-  // TODO: The limitation for extracting from the 0-element is not required,
-  // but if we extract from some other element, it will require shuffling to
-  // get the result into the right place.
   // TODO: This could be enhanced to handle smaller integer types by peeking
   // through an extend.
   SDValue Extract = Cast.getOperand(0);
   MVT DestVT = Cast.getSimpleValueType();
   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-      !isNullConstant(Extract.getOperand(1)))
+      !isa<ConstantSDNode>(Extract.getOperand(1)))
     return SDValue();
 
+  // See if we have a 128-bit vector cast op for this type of cast.
   SDValue VecOp = Extract.getOperand(0);
   MVT FromVT = VecOp.getSimpleValueType();
-  MVT ToVT = MVT::getVectorVT(DestVT, FromVT.getVectorNumElements());
-  if (!useVectorCast(Cast.getOpcode(), FromVT, ToVT, Subtarget))
+  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
+  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
+  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
+  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
     return SDValue();
 
-  // cast (extract V, Y) --> extract (cast V), Y
+  // If we are extracting from a non-zero element, first shuffle the source
+  // vector to allow extracting from element zero.
   SDLoc DL(Cast);
+  if (!isNullConstant(Extract.getOperand(1))) {
+    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
+    Mask[0] = Extract.getConstantOperandVal(1);
+    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
+  }
+  // If the source vector is wider than 128-bits, extract the low part. Do not
+  // create an unnecessarily wide vector cast op.
+  if (FromVT != Vec128VT)
+    VecOp = extract128BitVector(VecOp, 0, DAG, DL);
+
+  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
+  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
-                     Extract.getOperand(1));
+                     DAG.getIntPtrConstant(0, DL));
 }
 
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index c0406046ba68..5ab1cf2172ba 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -65,8 +65,8 @@ define float @signbits_ashr_extract_sitofp_0(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_extract_sitofp_0:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vextractps $1, %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 1f2749fd0994..ab3fd395d337 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -5726,23 +5726,16 @@ define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ; Extract non-zero element from int vector and convert to FP.
 
 define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_sitofp_v4i32_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_sitofp_v4i32_f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ssl %eax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract3_sitofp_v4i32_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_sitofp_v4i32_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = sitofp i32 %e to float
@@ -5767,8 +5760,8 @@ define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ;
 ; AVX-LABEL: extract3_sitofp_v4i32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sdl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = sitofp i32 %e to double
@@ -5797,11 +5790,33 @@ define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractps $3, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ssl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = uitofp i32 %e to float
   ret float %r
@@ -5829,11 +5844,37 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ; VEX-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractps $3, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sdl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = uitofp i32 %e to double
   ret double %r
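
As an IR-level illustration of the transform described in the commit message (not part of the patch): the lowering treats a scalar extract-then-convert as if it had been written as a shuffle, a vector convert, and an extract of lane 0, which is what the updated vec_int_to_fp.ll checks reflect. A minimal sketch in LLVM IR, with hypothetical function names:

define float @extract3_sitofp(<4 x i32> %x) {
  %e = extractelement <4 x i32> %x, i32 3
  %r = sitofp i32 %e to float
  ret float %r
}

; Conceptually lowered as if it were:
define float @extract3_sitofp_vectorized(<4 x i32> %x) {
  ; move the desired element into lane 0 (remaining lanes are don't-care)
  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ; convert the whole 128-bit vector (e.g. cvtdq2ps) instead of bouncing lane 3 through a GPR
  %cast = sitofp <4 x i32> %shuf to <4 x float>
  ; the scalar result is lane 0 of the vector conversion
  %r = extractelement <4 x float> %cast, i32 0
  ret float %r
}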