[DAGCombiner] try repeated fdiv divisor transform before building estimate (2nd try)

The original patch was committed at rL359398 and reverted at rL359695 because of infinite looping. This includes a fix to check for a vector splat of "1.0" to avoid the infinite loop. Original commit message: This was originally part of D61028, but it's an independent diff. If we try the repeated divisor reciprocal transform before producing an estimate sequence, then we have an opportunity to use scalar fdiv. On x86, the trade-off is 1 divss vs. 5 vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen), the full-precision division is only 3 cycle throughput, so that's probably the better perf default option and avoids problems from x86's inaccurate estimates. The last 2 tests show that users still have the option to override the defaults by using the function attributes for reciprocal estimates, but those patterns are potentially made faster by converting the vector ops (including ymm ops) to scalar math. Differential Revision: https://reviews.llvm.org/D61149 llvm-svn: 359793
2019-05-02 15:02:08 +00:00 · 2019-05-02 15:02:08 +00:00 · 1972826178
parent 4ad6dbc5fd
commit 1972826178
2 changed files with 31 additions and 43 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -11915,7 +11915,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {

  // Skip if current node is a reciprocal.
  SDValue N0 = N->getOperand(0);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
  if (N0CFP && N0CFP->isExactlyValue(1.0))
    return SDValue();

@ -11993,6 +11993,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

+  if (SDValue V = combineRepeatedFPDivisors(N))
+    return V;
+
  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
    // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
    if (N1CFP) {
@ -12082,9 +12085,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
    }
  }

-  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
-    return CombineRepeatedDivisors;
-
  return SDValue();
 }

--- a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll
+++ b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll
@ -51,25 +51,17 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm3, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    divss %xmm1, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
  %vy = insertelement <4 x float> undef, float %y, i32 0
@ -90,14 +82,10 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
  %vy = insertelement <8 x float> undef, float %y, i32 0
@ -109,25 +97,25 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; SSE-LABEL: splat_fdiv_v4f32_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
+; SSE-NEXT:    rcpss %xmm1, %xmm2
+; SSE-NEXT:    mulss %xmm2, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    mulss %xmm2, %xmm3
+; SSE-NEXT:    addss %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT:    mulps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
  %vy = insertelement <4 x float> undef, float %y, i32 0
@ -152,14 +140,14 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
  %vy = insertelement <8 x float> undef, float %y, i32 0