From e9fa3b8e6bb9a8d6d6674058c18b264cce83b026 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 15 Jul 2014 20:18:31 +0000 Subject: [PATCH] R600/SI: Implement less wrong f32 fdiv Assuming single precision denormals and accurate sqrt/div are not reported, this passes the OpenCL conformance test. llvm-svn: 213089 --- llvm/lib/Target/R600/SIISelLowering.cpp | 76 ++++++++++++++++++++++ llvm/lib/Target/R600/SIISelLowering.h | 3 + llvm/lib/Target/R600/SIInstructions.td | 11 ++-- llvm/test/CodeGen/R600/fdiv.ll | 77 ++++++++++++++--------- llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll | 17 ++++- llvm/test/CodeGen/R600/rsq.ll | 6 +- 6 files changed, 148 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp index a7db2a9a3d94..56e760cf517d 100644 --- a/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/llvm/lib/Target/R600/SIISelLowering.cpp @@ -221,6 +221,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FNEG, MVT::f64, Expand); setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); @@ -633,6 +635,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { @@ -930,6 +933,79 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } +static SDValue performUnsafeFDIV(SDValue Op, SelectionDAG &DAG) { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + if (CLHS->isExactlyValue(1.0)) { + + // 1.0 / sqrt(x) -> 
rsq(x) + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + // Turn into multiply by the reciprocal + // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getTarget().Options.UnsafeFPMath) + return performUnsafeFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); + + const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast<StoreSDNode>(Op); diff --git a/llvm/lib/Target/R600/SIISelLowering.h 
b/llvm/lib/Target/R600/SIISelLowering.h index e25323ae87d5..aa09d2c9f8e9 100644 --- a/llvm/lib/Target/R600/SIISelLowering.h +++ b/llvm/lib/Target/R600/SIISelLowering.h @@ -27,6 +27,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index e72203321a55..1d2dd2f63a65 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -1800,11 +1800,13 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -def : RcpPat; def : RcpPat; -defm : RsqPat; defm : RsqPat; +let Predicates = [UnsafeFPMath] in { +defm : RsqPat; +} + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// @@ -2336,11 +2338,6 @@ def : Pat < (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; -def : Pat< - (fdiv f32:$src0, f32:$src1), - (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) ->; - def : Pat< (fdiv f64:$src0, f64:$src1), (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) diff --git a/llvm/test/CodeGen/R600/fdiv.ll b/llvm/test/CodeGen/R600/fdiv.ll index 3d21524de0f4..20db65c5eb60 100644 --- a/llvm/test/CodeGen/R600/fdiv.ll +++ b/llvm/test/CodeGen/R600/fdiv.ll @@ -1,20 +1,37 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s 
--check-prefix=SI-CHECK +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; These tests check that fdiv is expanded correctly and also test that the ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate ; instruction groups. -; R600-CHECK: @fdiv_v2f32 -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; SI-CHECK: @fdiv_v2f32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 +; FUNC-LABEL: @fdiv_f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: @fdiv_v2f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: %0 = fdiv <2 x float> %a, %b @@ -22,24 +39,24 @@ entry: ret void } -; R600-CHECK: @fdiv_v4f32 -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; SI-CHECK: @fdiv_v4f32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 -; SI-CHECK-DAG: V_RCP_F32 -; SI-CHECK-DAG: V_MUL_F32 +; FUNC-LABEL: @fdiv_v4f32 +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS + +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 +; SI-DAG: V_RCP_F32 +; SI-DAG: V_MUL_F32 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float> addrspace(1) * %in diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll index ca5260dc5bc8..42910a99afde 100644 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI 
-enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone @@ -24,7 +25,15 @@ define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { } ; FUNC-LABEL: @rcp_pat_f32 -; SI: V_RCP_F32_e32 +; SI-UNSAFE-NOT: V_MUL_F32 +; SI-UNSAFE: V_RCP_F32_e32 +; SI-UNSAFE-NOT: V_MUL_F32 + +; Check for surrounding multiplies the correct divide has. +; SI-SAFE: V_MUL_F32 +; SI-SAFE: V_RCP_F32_e32 +; SI-SAFE: V_MUL_F32 + define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 @@ -40,7 +49,9 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { } ; FUNC-LABEL: @rsq_rcp_pat_f32 -; SI: V_RSQ_F32_e32 +; SI-UNSAFE: V_RSQ_F32_e32 +; SI-SAFE: V_SQRT_F32_e32 +; SI-SAFE: V_RCP_F32_e32 define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone diff --git a/llvm/test/CodeGen/R600/rsq.ll b/llvm/test/CodeGen/R600/rsq.ll index 87c05701104f..67177e9f68ba 100644 --- a/llvm/test/CodeGen/R600/rsq.ll +++ b/llvm/test/CodeGen/R600/rsq.ll @@ -1,10 +1,12 @@ -; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s declare float @llvm.sqrt.f32(float) nounwind readnone declare double @llvm.sqrt.f64(double) nounwind readnone ; 
SI-LABEL: @rsq_f32 -; SI: V_RSQ_F32_e32 +; SI-UNSAFE: V_RSQ_F32_e32 +; SI-SAFE: V_SQRT_F32 ; SI: S_ENDPGM define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float addrspace(1)* %in, align 4