AMDGPU: Use rcp for fdiv 1, x with fpmath metadata

Using rcp should be OK for safe math usually, so this should not be replacing the original fdiv. llvm-svn: 276823
2016-07-26 23:25:44 +00:00 · 2016-07-26 23:25:44 +00:00 · e3862cdc93
parent 685e8ff953
commit e3862cdc93
4 changed files with 95 additions and 19 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@ -76,7 +76,7 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
    return false;

  // Reciprocal f32 is handled separately without denormals.
-  return UnsafeDiv && CNum->isExactlyValue(+1.0);
+  return UnsafeDiv || CNum->isExactlyValue(+1.0);
 }

 // Insert an intrinsic for fast fdiv for safe math situations where we can
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@ -45,6 +45,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {

 ; CHECK-LABEL: @rcp_fdiv_fpmath(
 ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
+; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
 ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
 ; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
@ -54,6 +55,9 @@ define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
  %no.md = fdiv float 1.0, %x
  store volatile float %no.md, float addrspace(1)* %out

+  %md.25ulp = fdiv float 1.0, %x, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
  store volatile float %md.half.ulp, float addrspace(1)* %out

@ -146,13 +150,13 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float>
 ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
 ; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
+; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
 ; CHECK: store volatile <2 x float> %arcp.25ulp

 ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
 ; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
+; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
 define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@ -1,11 +1,96 @@
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

-; FIXME: Evergreen only ever does unsafe fp math.
 ; FUNC-LABEL: {{^}}rcp_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
 ; EG: RECIP_IEEE
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
  %rcp = fdiv float 1.0, %src
  store float %rcp, float addrspace(1)* %out, align 4
  ret void
 }
+
+; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv fast float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv arcp float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
+  %rcp = fdiv float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_fabs_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]|
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %src.fabs = call float @llvm.fabs.f32(float %src)
+  %rcp = fdiv float 1.0, %src.fabs
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FIXME: fneg folded into constant 1
+; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32:
+define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %src.fabs = call float @llvm.fabs.f32(float %src)
+  %src.fabs.fneg = fsub float -0.0, %src.fabs
+  %rcp = fdiv float 1.0, %src.fabs.fneg
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
+
+!0 = !{float 2.500000e+00}
--- a/llvm/test/CodeGen/AMDGPU/reciprocal.ll
+++ b/llvm/test/CodeGen/AMDGPU/reciprocal.ll
@ -1,13 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define amdgpu_ps void @test(<4 x float> inreg %reg0) {
-   %r0 = extractelement <4 x float> %reg0, i32 0
-   %r1 = fdiv float 1.0, %r0
-   %vec = insertelement <4 x float> undef, float %r1, i32 0
-   call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
-   ret void
-}
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)