diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index ad5fc5d848bf..2079d7690d00 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -2624,10 +2624,21 @@ public: return SDValue(); } - virtual SDValue BuildRSQRTE(SDValue Op, DAGCombinerInfo &DCI) const { + /// Hooks for building estimates in place of, for example, slower divisions + /// and square roots. These are not builder functions themselves, just the + /// target-specific variables needed for building the estimate algorithm. + + /// Return an estimate value for the input opcode and input operand. + /// The RefinementSteps output is the number of refinement iterations + /// required to generate a sufficient (though not necessarily IEEE-754 + /// compliant) estimate for the value type. + /// An empty SDValue return means no estimate sequence can be created. + virtual SDValue getEstimate(unsigned Opcode, SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { return SDValue(); } - + //===--------------------------------------------------------------------===// // Legalization utility functions // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 33e70593d154..34a0e04bc055 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -276,6 +276,7 @@ namespace { SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); + SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); @@ -326,7 +327,8 @@ namespace { SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); - SDValue BuildRSQRTE(SDNode *N); + SDValue BuildReciprocalEstimate(SDValue Op); + SDValue BuildRsqrtEstimate(SDValue Op); SDValue MatchBSwapHWordLow(SDNode *N, SDValue 
N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -1307,6 +1309,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); + case ISD::FSQRT: return visitFSQRT(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); @@ -6976,6 +6979,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); + SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops @@ -7007,10 +7011,37 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, DAG.getConstantFP(Recip, VT)); } + // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. 
- if (SDValue SqrtOp = BuildRSQRTE(N)) - return SqrtOp; + if (N1.getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } else if (N1.getOpcode() == ISD::FP_EXTEND && + N1.getOperand(0).getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } else if (N1.getOpcode() == ISD::FP_ROUND && + N1.getOperand(0).getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } + + // Fold into a reciprocal estimate and multiply instead of a real divide. + if (SDValue RV = BuildReciprocalEstimate(N1)) { + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) @@ -7042,6 +7073,33 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFSQRT(SDNode *N) { + if (DAG.getTarget().Options.UnsafeFPMath) { + // Compute this as 1/(1/sqrt(X)): the reciprocal of the reciprocal sqrt. + if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { + AddToWorklist(RV.getNode()); + RV = BuildReciprocalEstimate(RV); + if (RV.getNode()) { + // Unfortunately, RV is now NaN if the input was exactly 0. + // Select out this case and force the answer to 0. 
+ EVT VT = RV.getValueType(); + + SDValue Zero = DAG.getConstantFP(0.0, VT); + SDValue ZeroCmp = + DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT), + N->getOperand(0), Zero, ISD::SETEQ); + AddToWorklist(ZeroCmp.getNode()); + AddToWorklist(RV.getNode()); + + RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, + SDLoc(N), VT, ZeroCmp, Zero, RV); + return RV; + } + } + } + return SDValue(); +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -11702,36 +11760,92 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return S; } -/// Given an ISD::FDIV node with either a direct or indirect ISD::FSQRT operand, -/// generate a DAG expression using a reciprocal square root estimate op. -SDValue DAGCombiner::BuildRSQRTE(SDNode *N) { +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { + if (Level >= AfterLegalizeDAG) + return SDValue(); + // Expose the DAG combiner to the target combiner implementations. 
TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); - SDLoc DL(N); - EVT VT = N->getValueType(0); - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::FSQRT) { - if (SDValue RV = TLI.BuildRSQRTE(N1.getOperand(0), DCI)) { - AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV); + unsigned Iterations = 0; // Stays 0 if the target hook leaves it unset. + if (SDValue Est = TLI.getEstimate(ISD::FDIV, Op, DCI, Iterations)) { + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal, we need to find the zero of the function: + // F(X) = A X - 1 [which has a zero at X = 1/A] + // => + // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form + // does not require additional intermediate precision] + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue FPOne = DAG.getConstantFP(1.0, VT); + + AddToWorklist(Est.getNode()); + + // Newton iterations: Est = Est + Est (1 - Arg * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst); + AddToWorklist(Est.getNode()); } - } else if (N1.getOpcode() == ISD::FP_EXTEND && - N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = TLI.BuildRSQRTE(N1.getOperand(0).getOperand(0), DCI)) { - DCI.AddToWorklist(RV.getNode()); - RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); - AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV); - } - } else if (N1.getOpcode() == ISD::FP_ROUND && - N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = TLI.BuildRSQRTE(N1.getOperand(0).getOperand(0), DCI)) { - DCI.AddToWorklist(RV.getNode()); - RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
- AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV); + + return Est; + } + + return SDValue(); +} + +SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { + if (Level >= AfterLegalizeDAG) + return SDValue(); + + // Expose the DAG combiner to the target combiner implementations. + TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); + unsigned Iterations = 0; // Stays 0 if the target hook leaves it unset. + if (SDValue Est = TLI.getEstimate(ISD::FSQRT, Op, DCI, Iterations)) { + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal sqrt, we need to find the zero of the function: + // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] + // => + // X_{i+1} = X_i (1.5 - A X_i^2 / 2) + // As a result, we precompute A/2 prior to the iteration loop. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue FPThreeHalves = DAG.getConstantFP(1.5, VT); + + AddToWorklist(Est.getNode()); + + // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that + // this entire sequence requires only one FP constant.
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, FPThreeHalves, Op); + AddToWorklist(HalfArg.getNode()); + + HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Op); + AddToWorklist(HalfArg.getNode()); + + // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPThreeHalves, NewEst); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + AddToWorklist(Est.getNode()); } + + return Est; } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index e216a72c495c..5750e2fbb658 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7458,138 +7458,34 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Target Optimization Hooks //===----------------------------------------------------------------------===// -SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, - DAGCombinerInfo &DCI) const { - if (DCI.isAfterLegalizeVectorOps()) - return SDValue(); - - EVT VT = Op.getValueType(); - - if ((VT == MVT::f32 && Subtarget.hasFRES()) || - (VT == MVT::f64 && Subtarget.hasFRE()) || - (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX())) { - - // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) - // For the reciprocal, we need to find the zero of the function: - // F(X) = A X - 1 [which has a zero at X = 1/A] - // => - // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form - // does not require additional intermediate precision] - - // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. 
The minimum architected relative - // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has - // 23 digits and double has 52 digits. - int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; - if (VT.getScalarType() == MVT::f64) - ++Iterations; - - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(Op); - - SDValue FPOne = - DAG.getConstantFP(1.0, VT.getScalarType()); - if (VT.isVector()) { - assert(VT.getVectorNumElements() == 4 && - "Unknown vector type"); - FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - FPOne, FPOne, FPOne, FPOne); - } - - SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op); - DCI.AddToWorklist(Est.getNode()); - - // Newton iterations: Est = Est + Est (1 - Arg * Est) - for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est); - DCI.AddToWorklist(NewEst.getNode()); - - NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst); - DCI.AddToWorklist(NewEst.getNode()); - - NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); - DCI.AddToWorklist(NewEst.getNode()); - - Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst); - DCI.AddToWorklist(Est.getNode()); - } - - return Est; +SDValue PPCTargetLowering::getEstimate(unsigned Opcode, SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + EVT VT = Operand.getValueType(); + SDValue RV; + if (Opcode == ISD::FSQRT) { + if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) + RV = DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); + } else if (Opcode == ISD::FDIV) { + if ((VT == MVT::f32 && Subtarget.hasFRES()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) + RV = DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } - - return SDValue(); -} - -SDValue PPCTargetLowering::BuildRSQRTE(SDValue Op, 
DAGCombinerInfo &DCI) const { - if (DCI.isAfterLegalizeVectorOps()) - return SDValue(); - - EVT VT = Op.getValueType(); - - if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || - (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || - (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX())) { - - // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) - // For the reciprocal sqrt, we need to find the zero of the function: - // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] - // => - // X_{i+1} = X_i (1.5 - A X_i^2 / 2) - // As a result, we precompute A/2 prior to the iteration loop. - + if (RV.getNode()) { // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. The minimum architected relative - // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has - // 23 digits and double has 52 digits. - int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; + // correct after every iteration. For both FRE and FRSQRTE, the minimum + // architected relative accuracy is 2^-5. When hasRecipPrec(), this is + // 2^-14. IEEE float has 23 digits and double has 52 digits. + RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) - ++Iterations; - - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(Op); - - SDValue FPThreeHalves = - DAG.getConstantFP(1.5, VT.getScalarType()); - if (VT.isVector()) { - assert(VT.getVectorNumElements() == 4 && - "Unknown vector type"); - FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - FPThreeHalves, FPThreeHalves, - FPThreeHalves, FPThreeHalves); - } - - SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op); - DCI.AddToWorklist(Est.getNode()); - - // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that - // this entire sequence requires only one FP constant. 
- SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op); - DCI.AddToWorklist(HalfArg.getNode()); - - HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op); - DCI.AddToWorklist(HalfArg.getNode()); - - // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) - for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est); - DCI.AddToWorklist(NewEst.getNode()); - - NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst); - DCI.AddToWorklist(NewEst.getNode()); - - NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst); - DCI.AddToWorklist(NewEst.getNode()); - - Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); - DCI.AddToWorklist(Est.getNode()); - } - - return Est; + ++RefinementSteps; } - - return SDValue(); + return RV; } static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, @@ -8316,55 +8212,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SETCC: case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); - case ISD::FDIV: { - assert(TM.Options.UnsafeFPMath && - "Reciprocal estimates require UnsafeFPMath"); - - SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI); - if (RV.getNode()) { - DCI.AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), - N->getOperand(0), RV); - } - - } - break; - case ISD::FSQRT: { - assert(TM.Options.UnsafeFPMath && - "Reciprocal estimates require UnsafeFPMath"); - - // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the - // reciprocal sqrt. - SDValue RV = BuildRSQRTE(N->getOperand(0), DCI); - if (RV.getNode()) { - DCI.AddToWorklist(RV.getNode()); - RV = DAGCombineFastRecip(RV, DCI); - if (RV.getNode()) { - // Unfortunately, RV is now NaN if the input was exactly 0. Select out - // this case and force the answer to 0. 
- - EVT VT = RV.getValueType(); - - SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType()); - if (VT.isVector()) { - assert(VT.getVectorNumElements() == 4 && "Unknown vector type"); - Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero); - } - - SDValue ZeroCmp = - DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT), - N->getOperand(0), Zero, ISD::SETEQ); - DCI.AddToWorklist(ZeroCmp.getNode()); - DCI.AddToWorklist(RV.getNode()); - - RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT, - ZeroCmp, Zero, RV); - return RV; - } - } - - } - break; case ISD::SINT_TO_FP: if (TM.getSubtarget().has64BitSupport()) { if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 7444d41bbc47..197d97779b19 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -700,8 +700,10 @@ namespace llvm { SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const; SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const; - SDValue BuildRSQRTE(SDValue Op, DAGCombinerInfo &DCI) const; + + SDValue getEstimate(unsigned Opcode, SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; CCAssignFn *useFastISelCCs(unsigned Flag) const; }; diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll index 8111b5443664..d9c5d4061c84 100644 --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -16,12 +16,12 @@ define double @foo(double %a, double %b) nounwind { ; CHECK-DAG: frsqrte ; CHECK-DAG: fnmsub ; CHECK: fmul -; CHECK: fmadd -; CHECK: fmul -; CHECK: fmul -; CHECK: fmadd -; CHECK: fmul -; CHECK: fmul +; CHECK-NEXT: fmadd +; CHECK-NEXT: fmul +; CHECK-NEXT: fmul +; CHECK-NEXT: fmadd +; CHECK-NEXT: fmul +; CHECK-NEXT: fmul 
; CHECK: blr ; CHECK-SAFE: @foo @@ -85,10 +85,10 @@ define float @goo(float %a, float %b) nounwind { ; CHECK-DAG: frsqrtes ; CHECK-DAG: fnmsubs ; CHECK: fmuls -; CHECK: fmadds -; CHECK: fmuls -; CHECK: fmuls -; CHECK: blr +; CHECK-NEXT: fmadds +; CHECK-NEXT: fmuls +; CHECK-NEXT: fmuls +; CHECK-NEXT: blr ; CHECK-SAFE: @goo ; CHECK-SAFE: fsqrts @@ -117,10 +117,10 @@ define double @foo2(double %a, double %b) nounwind { ; CHECK-DAG: fre ; CHECK-DAG: fnmsub ; CHECK: fmadd -; CHECK: fnmsub -; CHECK: fmadd -; CHECK: fmul -; CHECK: blr +; CHECK-NEXT: fnmsub +; CHECK-NEXT: fmadd +; CHECK-NEXT: fmul +; CHECK-NEXT: blr ; CHECK-SAFE: @foo2 ; CHECK-SAFE: fdiv @@ -135,8 +135,8 @@ define float @goo2(float %a, float %b) nounwind { ; CHECK-DAG: fres ; CHECK-DAG: fnmsubs ; CHECK: fmadds -; CHECK: fmuls -; CHECK: blr +; CHECK-NEXT: fmuls +; CHECK-NEXT: blr ; CHECK-SAFE: @goo2 ; CHECK-SAFE: fdivs @@ -164,16 +164,16 @@ define double @foo3(double %a) nounwind { ; CHECK-DAG: frsqrte ; CHECK-DAG: fnmsub ; CHECK: fmul -; CHECK: fmadd -; CHECK: fmul -; CHECK: fmul -; CHECK: fmadd -; CHECK: fmul -; CHECK: fre -; CHECK: fnmsub -; CHECK: fmadd -; CHECK: fnmsub -; CHECK: fmadd +; CHECK-NEXT: fmadd +; CHECK-NEXT: fmul +; CHECK-NEXT: fmul +; CHECK-NEXT: fmadd +; CHECK-NEXT: fmul +; CHECK-NEXT: fre +; CHECK-NEXT: fnmsub +; CHECK-NEXT: fmadd +; CHECK-NEXT: fnmsub +; CHECK-NEXT: fmadd ; CHECK: blr ; CHECK-SAFE: @foo3 @@ -190,11 +190,11 @@ define float @goo3(float %a) nounwind { ; CHECK-DAG: frsqrtes ; CHECK-DAG: fnmsubs ; CHECK: fmuls -; CHECK: fmadds -; CHECK: fmuls -; CHECK: fres -; CHECK: fnmsubs -; CHECK: fmadds +; CHECK-NEXT: fmadds +; CHECK-NEXT: fmuls +; CHECK-NEXT: fres +; CHECK-NEXT: fnmsubs +; CHECK-NEXT: fmadds ; CHECK: blr ; CHECK-SAFE: @goo3