forked from OSchip/llvm-project
[x86] Implement combineRepeatedFPDivisors
Set the transform bar at 2 divisions because the fastest current x86 FP divider circuit is in SandyBridge / Haswell at 10 cycle latency (best case) relative to a 5 cycle multiplier. So that's the worst case for this transform (no latency win), but multiplies are obviously pipelined while divisions are not, so there's still a big throughput win which we would expect to show up in typical FP code. These are the sequences I'm comparing: divss %xmm2, %xmm0 mulss %xmm1, %xmm0 divss %xmm2, %xmm0 Becomes: movss LCPI0_0(%rip), %xmm3 ## xmm3 = mem[0],zero,zero,zero divss %xmm2, %xmm3 mulss %xmm3, %xmm0 mulss %xmm1, %xmm0 mulss %xmm3, %xmm0 [Ignore for the moment that we don't optimize the chain of 3 multiplies into 2 independent fmuls followed by 1 dependent fmul...this is the DAG version of: https://llvm.org/bugs/show_bug.cgi?id=21768 ...if we fix that, then the transform becomes even more profitable on all targets.] Differential Revision: http://reviews.llvm.org/D8941 llvm-svn: 235012
This commit is contained in:
parent
280d8dc9f0
commit
7024b8121a
|
@ -12818,6 +12818,16 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
/// If we have at least two divisions that use the same divisor, convert to
|
||||
/// multiplication by a reciprocal. This may need to be adjusted for a given
|
||||
/// CPU if a division's cost is not at least twice the cost of a multiplication.
|
||||
/// This is because we still need one division to calculate the reciprocal and
|
||||
/// then we need two multiplies by that reciprocal as replacements for the
|
||||
/// original divisions.
|
||||
bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
|
||||
// Opt in to the transform as soon as a second user is counted (NumUsers >= 2).
return NumUsers > 1;
|
||||
}
|
||||
|
||||
static bool isAllOnes(SDValue V) {
|
||||
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
|
||||
return C && C->isAllOnesValue();
|
||||
|
|
|
@ -1072,6 +1072,9 @@ namespace llvm {
|
|||
/// Use rcp* to speed up fdiv calculations.
|
||||
SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
|
||||
unsigned &RefinementSteps) const override;
|
||||
|
||||
/// Reassociate floating point divisions into multiply by reciprocal.
|
||||
bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
|
||||
};
|
||||
|
||||
namespace X86 {
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
|
||||
|
||||
; Anything more than one division using a single divisor operand
|
||||
; should be converted into a reciprocal and multiplication.
|
||||
|
||||
; A lone fdiv: no other division uses the same divisor, so the checks below
; expect the divss to be kept as-is.
define float @div1_arcp(float %x, float %y, float %z) #0 {
|
||||
; CHECK-LABEL: div1_arcp:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: divss %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%div1 = fdiv arcp float %x, %y
|
||||
ret float %div1
|
||||
}
|
||||
|
||||
; Both fdivs divide by %z. The checks below expect a single divss applied to a
; value loaded from memory (presumably the constant 1.0 -- confirm), followed
; by mulss instructions in place of the two original divisions.
define float @div2_arcp(float %x, float %y, float %z) #0 {
|
||||
; CHECK-LABEL: div2_arcp:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
|
||||
; CHECK-NEXT: divss %xmm2, %xmm3
|
||||
; CHECK-NEXT: mulss %xmm3, %xmm0
|
||||
; CHECK-NEXT: mulss %xmm1, %xmm0
|
||||
; CHECK-NEXT: mulss %xmm3, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%div1 = fdiv arcp float %x, %z
|
||||
%mul = fmul arcp float %div1, %y
|
||||
%div2 = fdiv arcp float %mul, %z
|
||||
ret float %div2
|
||||
}
|
||||
|
||||
; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
|
||||
attributes #0 = { "unsafe-fp-math"="true" }
|
Loading…
Reference in New Issue