forked from OSchip/llvm-project
AMDGPU/SI: Fix 32-bit fdiv lowering
We were using the fast fdiv lowering for all divisions; this change adds an implementation of IEEE 754 fdiv. http://reviews.llvm.org/D20557 llvm-svn: 272292
This commit is contained in:
parent
b6f0f521f5
commit
ed0f97fad2
|
@ -36,6 +36,12 @@
|
|||
|
||||
using namespace llvm;
|
||||
|
||||
// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
|
||||
static cl::opt<bool> EnableAMDGPUFastFDIV(
|
||||
"amdgpu-fast-fdiv",
|
||||
cl::desc("Enable faster 2.5 ulp fdiv"),
|
||||
cl::init(false));
|
||||
|
||||
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
|
||||
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
|
||||
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
|
||||
|
@ -1928,7 +1934,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
|
|||
}
|
||||
}
|
||||
|
||||
if (Unsafe) {
|
||||
const SDNodeFlags *Flags = Op->getFlags();
|
||||
|
||||
if (Unsafe || Flags->hasAllowReciprocal()) {
|
||||
// Turn into multiply by the reciprocal.
|
||||
// x / y -> x * (1.0 / y)
|
||||
SDNodeFlags Flags;
|
||||
|
@ -1953,32 +1961,61 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
|
|||
SDValue LHS = Op.getOperand(0);
|
||||
SDValue RHS = Op.getOperand(1);
|
||||
|
||||
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
||||
// Use the faster 2.5 ulp fdiv lowering when the -amdgpu-fast-fdiv flag is set.
|
||||
if (EnableAMDGPUFastFDIV) {
|
||||
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
||||
|
||||
const APFloat K0Val(BitsToFloat(0x6f800000));
|
||||
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
||||
const APFloat K0Val(BitsToFloat(0x6f800000));
|
||||
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
||||
|
||||
const APFloat K1Val(BitsToFloat(0x2f800000));
|
||||
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
||||
const APFloat K1Val(BitsToFloat(0x2f800000));
|
||||
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
||||
|
||||
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
||||
|
||||
EVT SetCCVT =
|
||||
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
||||
|
||||
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
||||
|
||||
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
||||
|
||||
// TODO: Should this propagate fast-math-flags?
|
||||
|
||||
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
||||
|
||||
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
||||
|
||||
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
||||
|
||||
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
||||
}
|
||||
|
||||
// Generate the more precise 32-bit fdiv expansion.
|
||||
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
||||
|
||||
EVT SetCCVT =
|
||||
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
||||
|
||||
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
||||
|
||||
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
||||
|
||||
// TODO: Should this propagate fast-math-flags?
|
||||
|
||||
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
||||
|
||||
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
||||
|
||||
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
||||
|
||||
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
||||
|
||||
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
|
||||
|
||||
SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
|
||||
SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
|
||||
|
||||
SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
|
||||
|
||||
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
|
||||
|
||||
SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
|
||||
SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
|
||||
|
||||
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
|
||||
|
||||
SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
|
||||
SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
|
||||
SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
|
||||
|
||||
SDValue Scale = NumeratorScaled.getValue(1);
|
||||
SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
|
||||
|
||||
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
|
|
@ -1,19 +1,33 @@
|
|||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
|
||||
|
||||
; These tests check that fdiv is expanded correctly and also test that the
|
||||
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
|
||||
; instruction groups.
|
||||
|
||||
; These tests check the fdiv lowering with unsafe-fp-math, the coarse (fast) fp div, and the IEEE 754 fp div.
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_f32:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
|
||||
; I754-DAG: v_div_scale_f32
|
||||
; I754-DAG: v_rcp_f32
|
||||
; I754-DAG: v_fma_f32
|
||||
; I754-DAG: v_mul_f32
|
||||
; I754-DAG: v_fma_f32
|
||||
; I754-DAG: v_div_fixup_f32
|
||||
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
|
||||
entry:
|
||||
%0 = fdiv float %a, %b
|
||||
|
@ -21,7 +35,41 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
|
||||
entry:
|
||||
%0 = fdiv fast float %a, %b
|
||||
store float %0, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
|
||||
entry:
|
||||
%0 = fdiv arcp float %a, %b
|
||||
store float %0, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v2f32:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
|
@ -29,10 +77,22 @@ entry:
|
|||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_fixup_f32
|
||||
; I754: v_div_fixup_f32
|
||||
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fdiv <2 x float> %a, %b
|
||||
|
@ -40,6 +100,50 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fdiv fast <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_rcp_f32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fdiv arcp <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v4f32:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
@ -50,6 +154,15 @@ entry:
|
|||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
|
@ -58,6 +171,19 @@ entry:
|
|||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_scale_f32
|
||||
; I754: v_div_fixup_f32
|
||||
; I754: v_div_fixup_f32
|
||||
; I754: v_div_fixup_f32
|
||||
; I754: v_div_fixup_f32
|
||||
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x float>, <4 x float> addrspace(1) * %in
|
||||
|
@ -66,3 +192,75 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
|
|||
store <4 x float> %result, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x float>, <4 x float> addrspace(1) * %in
|
||||
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
|
||||
%result = fdiv fast <4 x float> %a, %b
|
||||
store <4 x float> %result, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_rcp_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
; UNSAFE-FP: v_mul_f32_e32
|
||||
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
; SI-DAG: v_rcp_f32
|
||||
; SI-DAG: v_mul_f32
|
||||
define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x float>, <4 x float> addrspace(1) * %in
|
||||
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
|
||||
%result = fdiv arcp <4 x float> %a, %b
|
||||
store <4 x float> %result, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -5,11 +5,13 @@
|
|||
; FUNC-LABEL: {{^}}frem_f32:
|
||||
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
|
||||
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
|
||||
; GCN-DAG: v_cmp
|
||||
; GCN-DAG: v_mul_f32
|
||||
; GCN: v_div_scale_f32
|
||||
|
||||
; GCN: v_rcp_f32_e32
|
||||
; GCN: v_fma_f32
|
||||
; GCN: v_mul_f32_e32
|
||||
; GCN: v_mul_f32_e32
|
||||
; GCN: v_div_fmas_f32
|
||||
; GCN: v_div_fixup_f32
|
||||
; GCN: v_trunc_f32_e32
|
||||
; GCN: v_mad_f32
|
||||
; GCN: s_endpgm
|
||||
|
|
Loading…
Reference in New Issue