; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s

; More than one 'arcp' division using a single divisor operand
; should be converted into a reciprocal and multiplication.

; Don't do anything for just one division.

; Single 'arcp' division: below the 2-division threshold, so the divide
; must be kept as-is (no reciprocal + multiply transform).
define float @div1_arcp(float %x, float %y, float %z) {
; CHECK-LABEL: div1_arcp:
; CHECK: # BB#0:
; CHECK-NEXT: divss %xmm1, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv arcp float %x, %y
  ret float %div1
}

; All math instructions are 'arcp', so optimize.

; Two 'arcp' divisions by the same divisor %z: expect a single reciprocal
; (1.0 / %z, loaded into xmm3) followed by three multiplies.
define float @div2_arcp_all(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_all:
; CHECK: # BB#0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv arcp float %x, %z
  %mul = fmul arcp float %div1, %y
  %div2 = fdiv arcp float %mul, %z
  ret float %div2
}

; The first division is not 'arcp', so do not optimize.

; First fdiv lacks 'arcp': both divides must remain as real divisions.
define float @div2_arcp_partial1(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial1:
; CHECK: # BB#0:
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv float %x, %z
  %mul = fmul arcp float %div1, %y
  %div2 = fdiv arcp float %mul, %z
  ret float %div2
}

; The second division is not 'arcp', so do not optimize.

; Second fdiv lacks 'arcp': both divides must remain as real divisions.
define float @div2_arcp_partial2(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial2:
; CHECK: # BB#0:
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv arcp float %x, %z
  %mul = fmul arcp float %div1, %y
  %div2 = fdiv float %mul, %z
  ret float %div2
}

; The multiply is not 'arcp', but that does not prevent optimizing the divisions.

; Only the two fdivs need 'arcp'; the intervening non-arcp fmul does not
; block the reciprocal transform, so expect one divss + three mulss.
define float @div2_arcp_partial3(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial3:
; CHECK: # BB#0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv arcp float %x, %z
  %mul = fmul float %div1, %y
  %div2 = fdiv arcp float %mul, %z
  ret float %div2
}

; If the reciprocal is already calculated, we should not
; generate an extra multiplication by 1.0.

; The explicit 1.0/%y reciprocal is reused for %x/%y: expect exactly one
; divsd and one mulsd, with no redundant multiply by 1.0.
define double @div3_arcp(double %x, double %y, double %z) {
; CHECK-LABEL: div3_arcp:
; CHECK: # BB#0:
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: divsd %xmm1, %xmm2
; CHECK-NEXT: mulsd %xmm2, %xmm0
; CHECK-NEXT: addsd %xmm2, %xmm0
; CHECK-NEXT: retq
  %div1 = fdiv fast double 1.0, %y
  %div2 = fdiv fast double %x, %y
  %ret = fadd fast double %div2, %div1
  ret double %ret
}
; PR24141 regression: the repeated-divisor combine must not crash or
; misplace the division when the divisor is defined by a call inside a
; loop; the divsd must stay after the callq in each iteration.
define void @PR24141() {
; CHECK-LABEL: PR24141:
; CHECK: callq
; CHECK-NEXT: divsd
; CHECK-NEXT: jmp
entry:
  br label %while.body

while.body:
  %x.0 = phi double [ undef, %entry ], [ %div, %while.body ]
  %call = call { double, double } @g(double %x.0)
  %xv0 = extractvalue { double, double } %call, 0
  %xv1 = extractvalue { double, double } %call, 1
  %div = fdiv arcp double %xv0, %xv1
  br label %while.body
}

; External helper used by @PR24141; returns a pair whose elements are divided.
declare { double, double } @g(double)