AMDGPU: Use more accurate fast f64 fdiv
A raw v_rcp_f64 isn't accurate enough, so start applying correction.
parent 48c54f0f62
commit 2a0db8d70e
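For reference, the numerics behind the expansion: the raw v_rcp_f64 estimate is refined with two Newton–Raphson iterations (each FMA pair squares the relative error of the reciprocal), and the resulting quotient is corrected once more against the exact residual. Below is a minimal C++ sketch of that sequence; `rcp_estimate()` is a hypothetical stand-in for the hardware v_rcp_f64 result, not an LLVM API, and everything else mirrors the FMA chain the patch emits.

```cpp
#include <cmath>

// Hypothetical stand-in for the inaccurate hardware v_rcp_f64 estimate.
static double rcp_estimate(double y) {
  return 1.0 / y;
}

double fast_fdiv_f64(double x, double y) {
  double r = rcp_estimate(y);        // r ~= 1/y, low accuracy
  // Two Newton-Raphson steps; each squares the relative error of r.
  double e0 = std::fma(-y, r, 1.0);  // e0 = 1 - y*r
  r = std::fma(e0, r, r);            // r  = r + e0*r
  double e1 = std::fma(-y, r, 1.0);
  r = std::fma(e1, r, r);
  // Form the quotient, then refine it once with the exact residual.
  double q = x * r;                  // q ~= x/y
  double t = std::fma(-y, q, x);     // t = x - y*q
  return std::fma(t, r, q);          // q + t*r
}
```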
@@ -739,6 +739,11 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen.
  if (Ty->isDoubleTy())
    return false;

  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;
@@ -2752,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)

@@ -3092,9 +3089,49 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

@@ -3157,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

@@ -3223,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
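The correction built above (and mirrored in the SelectionDAG path further down) is standard Newton–Raphson refinement of the reciprocal, written in the FMA form the builders emit. A sketch of the error argument, assuming only that the initial estimate is finite and close to 1/y:

```latex
% Let e_n = 1 - y r_n be the relative error of the estimate r_n \approx 1/y.
\begin{align*}
  e_n     &= \operatorname{fma}(-y,\, r_n,\, 1) = 1 - y r_n \\
  r_{n+1} &= \operatorname{fma}(e_n,\, r_n,\, r_n) = r_n (2 - y r_n) \\
  e_{n+1} &= 1 - y r_{n+1} = (1 - y r_n)^2 = e_n^2
\end{align*}
% Two steps take an initial error of \epsilon down to roughly \epsilon^4;
% the final pair (q = x r, then fma(x - y q, r, q)) refines the quotient itself.
```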
@@ -128,6 +128,8 @@ public:
                              MachineIRBuilder &B) const;
  bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B) const;
  bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineIRBuilder &B) const;
  bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B) const;
@@ -8212,6 +8212,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}

SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
                            DAG.getTarget().Options.UnsafeFPMath;
  if (!AllowInaccurateDiv)
    return SDValue();

  SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);

  SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
  SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);

  R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
  SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
  R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
  SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
  SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
  return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                          SDNodeFlags Flags) {

@@ -8440,8 +8467,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
@@ -92,6 +92,7 @@ private:
  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFastUnsafeFDIV64(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
@@ -786,7 +786,6 @@ def : Pat <

let OtherPredicates = [UnsafeFPMath] in {

//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
@@ -67,8 +67,14 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn double %a, %b
  ret double %fdiv

@@ -245,7 +251,14 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GCN-LABEL: v_rcp_f64_arcp_afn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
; GCN-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv arcp afn double 1.0, %x
  ret double %fdiv

@@ -311,8 +324,14 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
; GCN-LABEL: v_fdiv_f64_afn_ulp25:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn double %a, %b, !fpmath !0
  ret double %fdiv

@@ -471,10 +490,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn <2 x double> %a, %b
  ret <2 x double> %fdiv

@@ -766,8 +797,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GCN-LABEL: v_rcp_v2f64_arcp_afn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[0:1], v[0:1]
; GCN-NEXT:    v_rcp_f64_e32 v[2:3], v[2:3]
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
; GCN-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
; GCN-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GCN-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GCN-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
; GCN-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
  ret <2 x double> %fdiv

@@ -871,10 +916,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
; GCN-LABEL: v_fdiv_v2f64_afn_ulp25:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
  ret <2 x double> %fdiv

@@ -978,10 +1035,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
; GCN-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[6:7], v[6:7]
; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GCN-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GCN-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GCN-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GCN-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
; GCN-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
; GCN-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
  ret <2 x double> %fdiv
@@ -410,10 +410,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT:    v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT:    v_mov_b32_e32 v2, s4

@@ -429,10 +435,16 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT:    v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, s4

@@ -455,10 +467,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
; CI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
; CI-NEXT:    v_mul_f64 v[4:5], s[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT:    v_mov_b32_e32 v2, s4

@@ -474,10 +492,16 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
; VI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
; VI-NEXT:    v_mul_f64 v[4:5], s[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -467,9 +467,17 @@ body: |
    ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
    ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
    ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]]
    ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
    ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s64)
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[INT]]
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMUL]](s64)
    ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
    ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
    ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
    ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMA3]]
    ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
    ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
    ; GFX10-LABEL: name: test_fdiv_s64
    ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX10: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3

@@ -1140,11 +1148,26 @@ body: |
    ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
    ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
    ; GFX9-UNSAFE: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
    ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[UV2]]
    ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
    ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s64)
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[INT]]
    ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
    ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
    ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
    ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[FMA3]]
    ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[UV]]
    ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
    ; GFX9-UNSAFE: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[UV3]]
    ; GFX9-UNSAFE: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s64)
    ; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[INT1]]
    ; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMUL]](s64), [[FMUL1]](s64)
    ; GFX9-UNSAFE: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT1]], [[C]]
    ; GFX9-UNSAFE: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[INT1]], [[INT1]]
    ; GFX9-UNSAFE: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA7]], [[C]]
    ; GFX9-UNSAFE: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMA8]], [[FMA7]], [[FMA7]]
    ; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[FMA9]]
    ; GFX9-UNSAFE: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[UV1]]
    ; GFX9-UNSAFE: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
    ; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
    ; GFX9-UNSAFE: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
    ; GFX10-LABEL: name: test_fdiv_v2s64
    ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3

@@ -2312,9 +2335,18 @@ body: |
    ; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
    ; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
    ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
    ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
    ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
    ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
    ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
    ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
    ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
    ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
    ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
    ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
    ; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
    ; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
    ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1

@@ -2409,10 +2441,19 @@ body: |
    ; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
    ; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
    ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
    ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
    ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
    ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s64)
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
    ; GFX9-UNSAFE: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
    ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
    ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C1]]
    ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
    ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]]
    ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
    ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
    ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
    ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
    ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
    ; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
    ; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
    ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
@@ -38,6 +38,35 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(
  ret void
}

; GCN-LABEL: {{^}}v_fdiv_f64_afn:
; GCN: v_rcp_f64_e32 v[4:5], v[2:3]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GCN: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GCN: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GCN: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GCN: s_setpc_b64
define double @v_fdiv_f64_afn(double %x, double %y) #0 {
  %result = fdiv afn double %x, %y
  ret double %result
}

; GCN-LABEL: {{^}}v_rcp_f64_afn:
; GCN: v_rcp_f64_e32 v[2:3], v[0:1]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
; GCN: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
; GCN: s_setpc_b64
define double @v_rcp_f64_afn(double %x) #0 {
  %result = fdiv afn double 1.0, %x
  ret double %result
}

; GCN-LABEL: {{^}}fdiv_f64_s_v:
define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
  %den = load double, double addrspace(1)* %in
@@ -711,41 +711,34 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: fast_frem_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s15, 0xf000
; SI-NEXT:    s_mov_b32 s14, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s8
; SI-NEXT:    s_mov_b32 s5, s9
; SI-NEXT:    s_mov_b32 s0, s10
; SI-NEXT:    s_mov_b32 s1, s11
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s0, s6
; SI-NEXT:    s_mov_b32 s1, s7
; SI-NEXT:    s_mov_b32 s2, s14
; SI-NEXT:    s_mov_b32 s3, s15
; SI-NEXT:    s_mov_b32 s10, s14
; SI-NEXT:    s_mov_b32 s11, s15
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT:    s_mov_b32 s1, 0xfffff
; SI-NEXT:    s_mov_b32 s0, s6
; SI-NEXT:    s_mov_b32 s0, s14
; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
; SI-NEXT:    v_not_b32_e32 v6, v6
; SI-NEXT:    v_and_b32_e32 v6, v4, v6

@@ -759,7 +752,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: fast_frem_f64:

@@ -780,18 +773,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0

@@ -811,18 +800,14 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]

@@ -855,7 +840,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; SI-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
; SI-NEXT:    s_mov_b32 s1, 0xfffff

@@ -895,7 +886,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0

@@ -916,7 +913,13 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
; VI-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -107,9 +107,13 @@ define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #
}

; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: v_rcp_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
  %rcp = fdiv double 1.0, %src
  store double %rcp, double addrspace(1)* %out, align 8
@@ -95,9 +95,15 @@ define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float a
; SI-SAFE: v_sqrt_f64_e32
; SI-SAFE: v_div_scale_f64

; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[VAL]]
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
  %val = load double, double addrspace(1)* %in, align 4
  %sqrt = call double @llvm.sqrt.f64(double %val)

@@ -127,9 +133,16 @@ define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, flo
; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
; SI-SAFE: v_div_scale_f64

; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]]
; SI-UNSAFE-DAG: v_xor_b32_e32 v[[HI:[0-9]+]], 0x80000000, v{{[0-9]+}}
; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI]]{{\]}}
; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
; SI-UNSAFE: v_fma_f64
define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
  %val = load double, double addrspace(1)* %in, align 4
  %val.fneg = fsub double -0.0, %val