AMDGPU: Use more accurate fast f64 fdiv

A raw v_rcp_f64 isn't accurate enough, so start applying correction: refine the reciprocal estimate with two Newton-Raphson iterations, then apply a final fma to correct the quotient.
Matt Arsenault 2021-01-20 13:55:55 -05:00
parent 48c54f0f62
commit 2a0db8d70e
13 changed files with 367 additions and 107 deletions
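For context: the new lowering (in both the GlobalISel and SelectionDAG paths below) refines the raw reciprocal estimate with two Newton-Raphson steps, multiplies to form an initial quotient, and applies one residual correction. The following is a minimal C++ sketch of the same arithmetic, not part of the commit; coarse_rcp is a hypothetical stand-in for v_rcp_f64, faked here with a single-precision divide.

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for v_rcp_f64: any coarse reciprocal estimate
// works; a single-precision divide has roughly 2^-24 relative error.
static double coarse_rcp(double y) { return (double)(1.0f / (float)y); }

double fast_fdiv_f64(double x, double y) {
  double r = coarse_rcp(y);                   // R = rcp(Y)
  r = std::fma(std::fma(-y, r, 1.0), r, r);   // Tmp0 = -Y*R + 1; R = Tmp0*R + R
  r = std::fma(std::fma(-y, r, 1.0), r, r);   // Tmp1 = -Y*R + 1; R = Tmp1*R + R
  double q = x * r;                           // Ret = X * R
  return std::fma(std::fma(-y, q, x), r, q);  // Tmp2 = -Y*Ret + X; Tmp2*R + Ret
}

int main() {
  std::printf("%.17g\n", fast_fdiv_f64(1.0, 3.0)); // ~0.33333333333333331
}

The comments mirror the value names used in the diff (R, Tmp0, Tmp1, Ret, Tmp2).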

View File

@@ -739,6 +739,11 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Type *Ty = FDiv.getType()->getScalarType();
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
// expansion around them in codegen.
if (Ty->isDoubleTy())
return false;
// No intrinsic for fdiv16 if target does not support f16.
if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;

View File

@@ -2752,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
if (DstTy == S16)
return legalizeFDIV16(MI, MRI, B);
if (DstTy == S32)
@@ -3092,9 +3089,49 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return true;
}
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Res = MI.getOperand(0).getReg();
Register X = MI.getOperand(1).getReg();
Register Y = MI.getOperand(2).getReg();
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
const MachineFunction &MF = B.getMF();
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
MI.getFlag(MachineInstr::FmAfn);
if (!AllowInaccurateRcp)
return false;
auto NegY = B.buildFNeg(ResTy, Y);
auto One = B.buildFConstant(ResTy, 1.0);
auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
.addUse(Y)
.setMIFlags(Flags);
auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp0, R, R);
auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp1, R, R);
auto Ret = B.buildFMul(ResTy, X, R);
auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
B.buildFMA(Res, Tmp2, R, Ret);
MI.eraseFromParent();
return true;
}
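A quick note on the iteration count (a back-of-the-envelope observation, not text from the commit): each refinement step above computes fma(fma(-y, r, 1.0), r, r), which in exact arithmetic is

$$r_{k+1} = r_k\,(2 - y\,r_k) \;\Longrightarrow\; 1 - y\,r_{k+1} = (1 - y\,r_k)^2,$$

so every step squares the relative error of the reciprocal estimate; two steps take an initial error e down to roughly e^4. The closing fma pair then folds the quotient residual x - y*q back into the result.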
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3157,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3223,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (legalizeFastUnsafeFDIV64(MI, MRI, B))
return true;
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();

View File

@@ -128,6 +128,8 @@ public:
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

View File

@@ -8212,6 +8212,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
DAG.getTarget().Options.UnsafeFPMath;
if (!AllowInaccurateDiv)
return SDValue();
SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
SDValue One = DAG.getConstantFP(1.0, SL, VT);
SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue GlueChain,
SDNodeFlags Flags) {
@@ -8440,8 +8467,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getTarget().Options.UnsafeFPMath)
return lowerFastUnsafeFDIV(Op, DAG);
if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
return FastLowered;
SDLoc SL(Op);
SDValue X = Op.getOperand(0);

View File

@@ -92,6 +92,7 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;

View File

@@ -786,7 +786,6 @@ def : Pat <
let OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;

View File

@@ -67,8 +67,14 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b
ret double %fdiv
@@ -245,7 +251,14 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GCN-LABEL: v_rcp_f64_arcp_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn double 1.0, %x
ret double %fdiv
@@ -311,8 +324,14 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b, !fpmath !0
ret double %fdiv
@@ -471,10 +490,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b
ret <2 x double> %fdiv
@@ -766,8 +797,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GCN-LABEL: v_rcp_v2f64_arcp_afn:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
; GCN-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
ret <2 x double> %fdiv
@@ -871,10 +916,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv
@@ -978,10 +1035,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
; GCN-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv

View File

@@ -410,10 +410,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -429,10 +435,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
@@ -455,10 +467,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -474,10 +492,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4

View File

@@ -467,9 +467,17 @@ body: |
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]]
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s64)
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMUL]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -1140,11 +1148,26 @@ body: |
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
; GFX9-UNSAFE: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[UV2]]
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s64)
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[INT]]
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[UV]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[UV3]]
; GFX9-UNSAFE: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s64)
; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMUL]](s64), [[FMUL1]](s64)
; GFX9-UNSAFE: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT1]], [[C]]
; GFX9-UNSAFE: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[INT1]], [[INT1]]
; GFX9-UNSAFE: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA7]], [[C]]
; GFX9-UNSAFE: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMA8]], [[FMA7]], [[FMA7]]
; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[FMA9]]
; GFX9-UNSAFE: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[UV1]]
; GFX9-UNSAFE: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX10-LABEL: name: test_fdiv_v2s64
; GFX10: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
@@ -2312,9 +2335,18 @@ body: |
; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
@@ -2409,10 +2441,19 @@ body: |
; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s64)
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
; GFX9-UNSAFE: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C1]]
; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]]
; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1

View File

@@ -38,6 +38,35 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(
ret void
}
; GCN-LABEL: {{^}}v_fdiv_f64_afn:
; GCN: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN: s_setpc_b64
define double @v_fdiv_f64_afn(double %x, double %y) #0 {
%result = fdiv afn double %x, %y
ret double %result
}
; GCN-LABEL: {{^}}v_rcp_f64_afn:
; GCN: v_rcp_f64_e32 v[2:3], v[0:1]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
; GCN: s_setpc_b64
define double @v_rcp_f64_afn(double %x) #0 {
%result = fdiv afn double 1.0, %x
ret double %result
}
; GCN-LABEL: {{^}}fdiv_f64_s_v:
define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
%den = load double, double addrspace(1)* %in

View File

@@ -711,41 +711,34 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: fast_frem_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT: s_mov_b32 s0, s10
; SI-NEXT: s_mov_b32 s1, s11
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_mov_b32 s3, s15
; SI-NEXT: s_mov_b32 s10, s14
; SI-NEXT: s_mov_b32 s11, s15
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s0, s14
; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
; SI-NEXT: v_not_b32_e32 v6, v6
; SI-NEXT: v_and_b32_e32 v6, v4, v6
@@ -759,7 +752,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: fast_frem_f64:
@@ -780,18 +773,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
; CI-NEXT: s_nop 1
; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -811,18 +800,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; VI-NEXT: s_nop 1
; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -855,7 +840,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT: s_mov_b32 s1, 0xfffff
@@ -895,7 +886,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -916,7 +913,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]

View File

@@ -107,9 +107,13 @@ define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #
}
; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: v_rcp_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
%rcp = fdiv double 1.0, %src
store double %rcp, double addrspace(1)* %out, align 8

View File

@@ -95,9 +95,15 @@ define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float a
; SI-SAFE: v_sqrt_f64_e32
; SI-SAFE: v_div_scale_f64
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
%val = load double, double addrspace(1)* %in, align 4
%sqrt = call double @llvm.sqrt.f64(double %val)
@@ -127,9 +133,16 @@ define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, flo
; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
; SI-SAFE: v_div_scale_f64
; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]]
; SI-UNSAFE-DAG: v_xor_b32_e32 v[[HI:[0-9]+]], 0x80000000, v{{[0-9]+}}
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI]]{{\]}}
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
%val = load double, double addrspace(1)* %in, align 4
%val.fneg = fsub double -0.0, %val