[DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x), better

I tried to fix this in: rG716e35a0cf53 ...but that patch depends on the order that we encounter the magic "x/sqrt(x)" expression in the combiner's worklist. This patch should improve that by waiting until we walk the user list to decide if there's a use to skip. The AArch64 test reveals another (existing) ordering problem though - we may try to create an estimate for plain sqrt(x) before we see that it is part of a 1/sqrt(x) expression.
2020-08-31 09:31:15 -04:00 · 2020-08-31 09:31:15 -04:00 · 1c9a09f42e
parent 11e0c5b648
commit 1c9a09f42e
3 changed files with 37 additions and 35 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -13235,11 +13235,6 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
  if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
    return SDValue();

-  // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
-  if (N1.getOpcode() == ISD::FSQRT && N1.getOperand(0) == N0 &&
-      Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())
-    return SDValue();
-
  // Exit early if the target does not want this transform or if there can't
  // possibly be enough uses of the divisor to make the transform worthwhile.
  unsigned MinUses = TLI.combineRepeatedFPDivisors();
@ -13259,6 +13254,13 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
  SetVector<SDNode *> Users;
  for (auto *U : N1->uses()) {
    if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
+      // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
+      if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
+          U->getOperand(0) == U->getOperand(1).getOperand(0) &&
+          U->getFlags().hasAllowReassociation() &&
+          U->getFlags().hasNoSignedZeros())
+        continue;
+
      // This division is eligible for optimization only if global unsafe math
      // is enabled or if this division allows reciprocal formation.
      if (UnsafeMath || U->getFlags().hasAllowReciprocal())
@ -13470,6 +13472,10 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
    return SDValue();

  // FSQRT nodes have flags that propagate to the created nodes.
+  // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
+  //       transform the fdiv, we may produce a sub-optimal estimate sequence
+  //       because the reciprocal calculation may not have to filter out a
+  //       0.0 input.
  return buildSqrtEstimate(N0, Flags);
 }

--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@ -570,19 +570,16 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, double* %p1, double*
 define double @sqrt_simplify_before_recip_3_uses_order(double %x, double* %p1, double* %p2) nounwind {
 ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
 ; FAULT:       // %bb.0:
+; FAULT-NEXT:    mov x9, #140737488355328
 ; FAULT-NEXT:    mov x8, #4631107791820423168
-; FAULT-NEXT:    fmov d3, x8
-; FAULT-NEXT:    mov x8, #140737488355328
-; FAULT-NEXT:    fsqrt d1, d0
-; FAULT-NEXT:    fmov d2, #1.00000000
-; FAULT-NEXT:    movk x8, #16453, lsl #48
-; FAULT-NEXT:    fdiv d1, d2, d1
-; FAULT-NEXT:    fmov d2, x8
-; FAULT-NEXT:    fmul d0, d0, d1
-; FAULT-NEXT:    fmul d3, d1, d3
-; FAULT-NEXT:    fmul d1, d1, d2
-; FAULT-NEXT:    str d3, [x0]
-; FAULT-NEXT:    str d1, [x1]
+; FAULT-NEXT:    movk x9, #16453, lsl #48
+; FAULT-NEXT:    fsqrt d0, d0
+; FAULT-NEXT:    fmov d1, x8
+; FAULT-NEXT:    fmov d2, x9
+; FAULT-NEXT:    fdiv d1, d1, d0
+; FAULT-NEXT:    fdiv d2, d2, d0
+; FAULT-NEXT:    str d1, [x0]
+; FAULT-NEXT:    str d2, [x1]
 ; FAULT-NEXT:    ret
 ;
 ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
@ -644,21 +641,24 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, double* %p1, double*
 ; CHECK-NEXT:    fmul d1, d1, d3
 ; CHECK-NEXT:    fmul d3, d1, d1
 ; CHECK-NEXT:    frsqrts d3, d0, d3
-; CHECK-NEXT:    mov x8, #4631107791820423168
 ; CHECK-NEXT:    fmul d1, d1, d3
+; CHECK-NEXT:    mov x8, #4631107791820423168
+; CHECK-NEXT:    fmul d3, d1, d1
 ; CHECK-NEXT:    fmov d2, x8
 ; CHECK-NEXT:    mov x8, #140737488355328
-; CHECK-NEXT:    fmul d3, d1, d1
-; CHECK-NEXT:    movk x8, #16453, lsl #48
 ; CHECK-NEXT:    frsqrts d3, d0, d3
+; CHECK-NEXT:    movk x8, #16453, lsl #48
 ; CHECK-NEXT:    fmul d1, d1, d3
-; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    fmov d4, x8
+; CHECK-NEXT:    fmul d3, d0, d1
 ; CHECK-NEXT:    fmul d2, d1, d2
-; CHECK-NEXT:    fmul d3, d1, d3
-; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    fmul d4, d1, d4
 ; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    fcsel d1, d0, d3, eq
+; CHECK-NEXT:    fdiv d0, d0, d1
 ; CHECK-NEXT:    str d2, [x1]
-; CHECK-NEXT:    str d3, [x2]
+; CHECK-NEXT:    str d4, [x2]
 ; CHECK-NEXT:    ret
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@ -997,21 +997,17 @@ define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double
 define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip_order:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtsd %xmm0, %xmm1
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    divsd %xmm1, %xmm2
-; SSE-NEXT:    mulsd %xmm2, %xmm0
-; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm2
-; SSE-NEXT:    movsd %xmm2, (%rdi)
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movsd %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_simplify_before_recip_order:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vdivsd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vmovsd %xmm1, (%rdi)
 ; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)