From 7024b8121a9e51d468302e43ed41aeb6e1fb7274 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 15 Apr 2015 15:22:55 +0000 Subject: [PATCH] [x86] Implement combineRepeatedFPDivisors Set the transform bar at 2 divisions because the fastest current x86 FP divider circuit is in SandyBridge / Haswell at 10 cycle latency (best case) relative to a 5 cycle multiplier. So that's the worst case for this transform (no latency win), but multiplies are obviously pipelined while divisions are not, so there's still a big throughput win which we would expect to show up in typical FP code. These are the sequences I'm comparing: divss %xmm2, %xmm0 mulss %xmm1, %xmm0 divss %xmm2, %xmm0 Becomes: movss LCPI0_0(%rip), %xmm3 ## xmm3 = mem[0],zero,zero,zero divss %xmm2, %xmm3 mulss %xmm3, %xmm0 mulss %xmm1, %xmm0 mulss %xmm3, %xmm0 [Ignore for the moment that we don't optimize the chain of 3 multiplies into 2 independent fmuls followed by 1 dependent fmul...this is the DAG version of: https://llvm.org/bugs/show_bug.cgi?id=21768 ...if we fix that, then the transform becomes even more profitable on all targets.] Differential Revision: http://reviews.llvm.org/D8941 llvm-svn: 235012 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++ llvm/lib/Target/X86/X86ISelLowering.h | 3 +++ llvm/test/CodeGen/X86/fdiv-combine.ll | 31 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 llvm/test/CodeGen/X86/fdiv-combine.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1c60237f75b0..c32412a741c8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12818,6 +12818,16 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, return SDValue(); } +/// If we have at least two divisions that use the same divisor, convert to +/// multiplication by a reciprocal. 
This may need to be adjusted for a given +/// CPU if a division's cost is not at least twice the cost of a multiplication. +/// This is because we still need one division to calculate the reciprocal and +/// then we need two multiplies by that reciprocal as replacements for the +/// original divisions. +bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { + return NumUsers > 1; +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index dd20ec23976c..5130c37b0428 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1072,6 +1072,9 @@ namespace llvm { /// Use rcp* to speed up fdiv calculations. SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; + + /// Reassociate floating point divisions into multiply by reciprocal. + bool combineRepeatedFPDivisors(unsigned NumUsers) const override; }; namespace X86 { diff --git a/llvm/test/CodeGen/X86/fdiv-combine.ll b/llvm/test/CodeGen/X86/fdiv-combine.ll new file mode 100644 index 000000000000..279bb0624ace --- /dev/null +++ b/llvm/test/CodeGen/X86/fdiv-combine.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Anything more than one division using a single divisor operand +; should be converted into a reciprocal and multiplication. 
+ +define float @div1_arcp(float %x, float %y, float %z) #0 { +; CHECK-LABEL: div1_arcp: +; CHECK: # BB#0: +; CHECK-NEXT: divss %xmm1, %xmm0 +; CHECK-NEXT: retq + %div1 = fdiv arcp float %x, %y + ret float %div1 +} + +define float @div2_arcp(float %x, float %y, float %z) #0 { +; CHECK-LABEL: div2_arcp: +; CHECK: # BB#0: +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: divss %xmm2, %xmm3 +; CHECK-NEXT: mulss %xmm3, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 +; CHECK-NEXT: mulss %xmm3, %xmm0 +; CHECK-NEXT: retq + %div1 = fdiv arcp float %x, %z + %mul = fmul arcp float %div1, %y + %div2 = fdiv arcp float %mul, %z + ret float %div2 +} + +; FIXME: If the backend understands 'arcp', then this attribute is unnecessary. +attributes #0 = { "unsafe-fp-math"="true" }