AMDGPU: Use more accurate fast f64 fdiv

A raw v_rcp_f64 isn't accurate enough, so start applying correction: refine the reciprocal estimate with two Newton-Raphson iterations, then apply a final fma to correct the quotient.
Matt Arsenault 2021-01-20 13:55:55 -05:00
parent 48c54f0f62
commit 2a0db8d70e
13 changed files with 367 additions and 107 deletions
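For context: the new lowering (in both the GlobalISel and SelectionDAG paths below) refines the raw reciprocal estimate with two Newton-Raphson steps, multiplies to form an initial quotient, and applies one residual correction. The following is a minimal C++ sketch of the same arithmetic, not part of the commit; coarse_rcp is a hypothetical stand-in for v_rcp_f64, faked here with a single-precision divide.

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for v_rcp_f64: any coarse reciprocal estimate
// works; a single-precision divide has roughly 2^-24 relative error.
static double coarse_rcp(double y) { return (double)(1.0f / (float)y); }

double fast_fdiv_f64(double x, double y) {
  double r = coarse_rcp(y);                   // R = rcp(Y)
  r = std::fma(std::fma(-y, r, 1.0), r, r);   // Tmp0 = -Y*R + 1; R = Tmp0*R + R
  r = std::fma(std::fma(-y, r, 1.0), r, r);   // Tmp1 = -Y*R + 1; R = Tmp1*R + R
  double q = x * r;                           // Ret = X * R
  return std::fma(std::fma(-y, q, x), r, q);  // Tmp2 = -Y*Ret + X; Tmp2*R + Ret
}

int main() {
  std::printf("%.17g\n", fast_fdiv_f64(1.0, 3.0)); // ~0.33333333333333331
}

The comments mirror the value names used in the diff (R, Tmp0, Tmp1, Ret, Tmp2).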

View File

@@ -739,6 +739,11 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Type *Ty = FDiv.getType()->getScalarType();
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
// expansion around them in codegen.
if (Ty->isDoubleTy())
return false;
// No intrinsic for fdiv16 if target does not support f16.
if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;

View File

@@ -2752,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
if (DstTy == S16)
return legalizeFDIV16(MI, MRI, B);
if (DstTy == S32)
@@ -3092,9 +3089,49 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return true;
}
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Res = MI.getOperand(0).getReg();
Register X = MI.getOperand(1).getReg();
Register Y = MI.getOperand(2).getReg();
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
const MachineFunction &MF = B.getMF();
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
MI.getFlag(MachineInstr::FmAfn);
if (!AllowInaccurateRcp)
return false;
auto NegY = B.buildFNeg(ResTy, Y);
auto One = B.buildFConstant(ResTy, 1.0);
auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
.addUse(Y)
.setMIFlags(Flags);
auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp0, R, R);
auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp1, R, R);
auto Ret = B.buildFMul(ResTy, X, R);
auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
B.buildFMA(Res, Tmp2, R, Ret);
MI.eraseFromParent();
return true;
}
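A quick note on the iteration count (a back-of-the-envelope observation, not text from the commit): each refinement step above computes fma(fma(-y, r, 1.0), r, r), which in exact arithmetic is

$$r_{k+1} = r_k\,(2 - y\,r_k) \;\Longrightarrow\; 1 - y\,r_{k+1} = (1 - y\,r_k)^2,$$

so every step squares the relative error of the reciprocal estimate; two steps take an initial error e down to roughly e^4. The closing fma pair then folds the quotient residual x - y*q back into the result.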
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3157,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3223,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV64(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();

View File

@@ -128,6 +128,8 @@ public:
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

View File

@@ -8212,6 +8212,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
DAG.getTarget().Options.UnsafeFPMath;
if (!AllowInaccurateDiv)
return SDValue();
SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
SDValue One = DAG.getConstantFP(1.0, SL, VT);
SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue GlueChain,
SDNodeFlags Flags) {
@@ -8440,8 +8467,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getTarget().Options.UnsafeFPMath)
return lowerFastUnsafeFDIV(Op, DAG);
if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
return FastLowered;
SDLoc SL(Op);
SDValue X = Op.getOperand(0);

View File

@@ -92,6 +92,7 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;

View File

@@ -786,7 +786,6 @@ def : Pat <
let OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;

View File

@@ -67,8 +67,14 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b
ret double %fdiv
@@ -245,7 +251,14 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GCN-LABEL: v_rcp_f64_arcp_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn double 1.0, %x
ret double %fdiv
@@ -311,8 +324,14 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b, !fpmath !0
ret double %fdiv
@@ -471,10 +490,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b
ret <2 x double> %fdiv
@@ -766,8 +797,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GCN-LABEL: v_rcp_v2f64_arcp_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
; GCN-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
ret <2 x double> %fdiv
@@ -871,10 +916,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv
@@ -978,10 +1035,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
; GCN-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv

View File

@@ -410,10 +410,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -429,10 +435,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
@@ -455,10 +467,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -474,10 +492,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4

View File

@@ -467,9 +467,17 @@ body: |
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]]
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s64)
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMUL]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -1140,11 +1148,26 @@ body: |
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
; GFX9-UNSAFE: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[UV2]]
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s64)
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[INT]]
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[UV]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[UV3]]
; GFX9-UNSAFE: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s64)
; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMUL]](s64), [[FMUL1]](s64)
; GFX9-UNSAFE: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT1]], [[C]]
; GFX9-UNSAFE: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[INT1]], [[INT1]]
; GFX9-UNSAFE: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA7]], [[C]]
; GFX9-UNSAFE: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMA8]], [[FMA7]], [[FMA7]]
; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[FMA9]]
; GFX9-UNSAFE: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[UV1]]
; GFX9-UNSAFE: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX10-LABEL: name: test_fdiv_v2s64
; GFX10: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
@@ -2312,9 +2335,18 @@ body: |
; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
@@ -2409,10 +2441,19 @@ body: |
; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
; GFX9-UNSAFE: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C1]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1

View File

@@ -38,6 +38,35 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(
ret void
}
; GCN-LABEL: {{^}}v_fdiv_f64_afn:
; GCN: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN: s_setpc_b64
define double @v_fdiv_f64_afn(double %x, double %y) #0 {
%result = fdiv afn double %x, %y
ret double %result
}
; GCN-LABEL: {{^}}v_rcp_f64_afn:
; GCN: v_rcp_f64_e32 v[2:3], v[0:1]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
; GCN: s_setpc_b64
define double @v_rcp_f64_afn(double %x) #0 {
%result = fdiv afn double 1.0, %x
ret double %result
}
; GCN-LABEL: {{^}}fdiv_f64_s_v:
define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
%den = load double, double addrspace(1)* %in

View File

@@ -711,41 +711,34 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: fast_frem_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT: s_mov_b32 s0, s10
; SI-NEXT: s_mov_b32 s1, s11
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_mov_b32 s3, s15
; SI-NEXT: s_mov_b32 s10, s14
; SI-NEXT: s_mov_b32 s11, s15
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s0, s14
; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
; SI-NEXT: v_not_b32_e32 v6, v6
; SI-NEXT: v_and_b32_e32 v6, v4, v6
@@ -759,7 +752,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: fast_frem_f64:
@@ -780,18 +773,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
; CI-NEXT: s_nop 1
; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -811,18 +800,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; VI-NEXT: s_nop 1
; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -855,7 +840,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT: s_mov_b32 s1, 0xfffff
@@ -895,7 +886,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -916,7 +913,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]

View File

@@ -107,9 +107,13 @@ define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #
}
; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: v_rcp_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
%rcp = fdiv double 1.0, %src
store double %rcp, double addrspace(1)* %out, align 8

View File

@@ -95,9 +95,15 @@ define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float a
; SI-SAFE: v_sqrt_f64_e32
; SI-SAFE: v_div_scale_f64
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
%val = load double, double addrspace(1)* %in, align 4
%sqrt = call double @llvm.sqrt.f64(double %val)
@@ -127,9 +133,16 @@ define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, flo
; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
; SI-SAFE: v_div_scale_f64
; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]]
; SI-UNSAFE-DAG: v_xor_b32_e32 v[[HI:[0-9]+]], 0x80000000, v{{[0-9]+}}
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI]]{{\]}}
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
%val = load double, double addrspace(1)* %in, align 4
%val.fneg = fsub double -0.0, %val