[X86][FP16] Only generate approximate rsqrt when Reciprocal is true for half type

We have reasonably fast sqrt and accurate rsqrt for half type due to the limited fraction bits. So we need neither multi-step refinement for rsqrt nor to replace sqrt with rsqrt. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D114844
2021-12-02 13:11:07 +08:00 · 2021-12-02 13:11:07 +08:00 · f13b43d570
parent 4756a2f157
commit f13b43d570
3 changed files with 7 additions and 10 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -23190,6 +23190,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

+  // We don't need to replace SQRT with RSQRT for half type.
+  if (VT.getScalarType() == MVT::f16)
+    return true;
+
  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;
@ -23236,6 +23240,7 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,

  if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
      Subtarget.hasFP16()) {
+    assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 0;

--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@ -123,12 +123,7 @@ define half @test_sqrt_sh2(half %a0, half %a1) {
 define half @test_sqrt_sh3(half %a0, half %a1) {
 ; CHECK-LABEL: test_sqrt_sh3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; CHECK-NEXT:    vrsqrtsh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
  %1 = call fast half @llvm.sqrt.f16(half %a0)
  ret half %1
--- a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll
@ -972,10 +972,7 @@ define <8 x half> @test_sqrt_ph_128_fast(<8 x half> %a0, <8 x half> %a1) {
 define <8 x half> @test_sqrt_ph_128_fast2(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: test_sqrt_ph_128_fast2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vcmpgeph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm1, %k1
-; CHECK-NEXT:    vrsqrtph %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vsqrtph %xmm0, %xmm0
 ; CHECK-NEXT:    retq
  %1 = call fast <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  ret <8 x half> %1