llvm-project/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s

; GCN-LABEL: {{^}}rcp_uint:
; GCN: v_rcp_iflag_f32_e32
; Kernel: reads an i32, converts it to float as an unsigned value, and stores
; 1.0 / value. The directives above expect v_rcp_iflag_f32_e32 to be selected.
define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 {
  ; Fetch the integer operand from the input pointer.
  %ival = load i32, i32 addrspace(1)* %in, align 4
  ; Unsigned int -> float; this conversion feeds the reciprocal directly.
  %fval = uitofp i32 %ival to float
  ; Reciprocal written as 1.0 / x, tagged with the module's !0 fpmath metadata.
  %recip = fdiv float 1.000000e+00, %fval, !fpmath !0
  ; Write the result to the output pointer.
  store float %recip, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}rcp_sint:
; GCN: v_rcp_iflag_f32_e32
; Kernel: reads an i32, converts it to float as a signed value, and stores
; 1.0 / value. The directives above expect v_rcp_iflag_f32_e32 to be selected.
define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 {
  ; Fetch the integer operand from the input pointer.
  %ival = load i32, i32 addrspace(1)* %in, align 4
  ; Signed int -> float; this conversion feeds the reciprocal directly.
  %fval = sitofp i32 %ival to float
  ; Reciprocal written as 1.0 / x, tagged with the module's !0 fpmath metadata.
  %recip = fdiv float 1.000000e+00, %fval, !fpmath !0
  ; Write the result to the output pointer.
  store float %recip, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}rcp_uint_denorm:
; GCN-NOT: v_rcp_iflag_f32
; Kernel: same shape as the unsigned reciprocal test, but built with attribute
; group #1 and with no !fpmath tag on the division. The directives above
; require that v_rcp_iflag_f32 does not appear in the output for this function.
define amdgpu_kernel void @rcp_uint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 {
  ; Fetch the integer operand from the input pointer.
  %ival = load i32, i32 addrspace(1)* %in, align 4
  ; Unsigned int -> float.
  %fval = uitofp i32 %ival to float
  ; Plain 1.0 / x with no accuracy metadata attached.
  %recip = fdiv float 1.000000e+00, %fval
  ; Write the result to the output pointer.
  store float %recip, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}rcp_sint_denorm:
; GCN-NOT: v_rcp_iflag_f32
; Kernel: same shape as the signed reciprocal test, but built with attribute
; group #1 and with no !fpmath tag on the division. The directives above
; require that v_rcp_iflag_f32 does not appear in the output for this function.
define amdgpu_kernel void @rcp_sint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 {
  ; Fetch the integer operand from the input pointer.
  %ival = load i32, i32 addrspace(1)* %in, align 4
  ; Signed int -> float.
  %fval = sitofp i32 %ival to float
  ; Plain 1.0 / x with no accuracy metadata attached.
  %recip = fdiv float 1.000000e+00, %fval
  ; Write the result to the output pointer.
  store float %recip, float addrspace(1)* %out, align 4
  ret void
}

; Metadata node referenced as `!fpmath !0` by the fdivs in rcp_uint and
; rcp_sint; it carries the float value 2.5 (an accuracy bound, in ULPs per the
; LLVM !fpmath convention).
!0 = !{float 2.500000e+00}

; Attribute group #0 (rcp_uint, rcp_sint): f32 denormal mode set to
; "preserve-sign,preserve-sign".
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; Attribute group #1 (rcp_uint_denorm, rcp_sint_denorm): f32 denormal mode set
; to "ieee,ieee".
attributes #1 = { "denormal-fp-math-f32"="ieee,ieee" }
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck --check-prefix=GCN %s`
[AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 2018-06-27 23:33:33 +08:00
			`; GCN-LABEL: {{^}}rcp_uint:`
			`; GCN: v_rcp_iflag_f32_e32`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 {`
[AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 2018-06-27 23:33:33 +08:00			`%load = load i32, i32 addrspace(1)* %in, align 4`
			`%cvt = uitofp i32 %load to float`
AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare Summary: RCP has the accuracy limit. If FDIV fpmath require high accuracy rcp may not meet the requirement. However, in DAG lowering, fpmath information gets lost, and thus we may generate either inaccurate rcp related computation or slow code for fdiv. In patch implements fdiv optimizations in the AMDGPUCodeGenPrepare, which could exactly know !fpmath. FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on unsafe-fp-math, fast math flags, denormals and fpmath accuracy request. RCP Optimizations: 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with denormals flushed. a/b -> a*rcp(b) when fast unsafe rcp is legal. Use fdiv.fast: a/b -> fdiv.fast(a, b) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals flushed. 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D71293 2020-01-24 08:57:43 +08:00			`%div = fdiv float 1.000000e+00, %cvt, !fpmath !0`
[AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 2018-06-27 23:33:33 +08:00			`store float %div, float addrspace(1)* %out, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}rcp_sint:`
			`; GCN: v_rcp_iflag_f32_e32`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 {`
[AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 2018-06-27 23:33:33 +08:00			`%load = load i32, i32 addrspace(1)* %in, align 4`
			`%cvt = sitofp i32 %load to float`
AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare Summary: RCP has the accuracy limit. If FDIV fpmath require high accuracy rcp may not meet the requirement. However, in DAG lowering, fpmath information gets lost, and thus we may generate either inaccurate rcp related computation or slow code for fdiv. In patch implements fdiv optimizations in the AMDGPUCodeGenPrepare, which could exactly know !fpmath. FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on unsafe-fp-math, fast math flags, denormals and fpmath accuracy request. RCP Optimizations: 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with denormals flushed. a/b -> a*rcp(b) when fast unsafe rcp is legal. Use fdiv.fast: a/b -> fdiv.fast(a, b) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals flushed. 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D71293 2020-01-24 08:57:43 +08:00			`%div = fdiv float 1.000000e+00, %cvt, !fpmath !0`
[AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 2018-06-27 23:33:33 +08:00			`store float %div, float addrspace(1)* %out, align 4`
			`ret void`
			`}`
AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare Summary: RCP has limited accuracy. If an FDIV's fpmath requires high accuracy, rcp may not meet the requirement. However, in DAG lowering, fpmath information gets lost, and thus we may generate either inaccurate rcp-related computation or slow code for fdiv. This patch implements fdiv optimizations in AMDGPUCodeGenPrepare, where the !fpmath information is known exactly. FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on unsafe-fp-math, fast math flags, denormals and fpmath accuracy request. RCP Optimizations: 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with denormals flushed. a/b -> a*rcp(b) when fast unsafe rcp is legal. Use fdiv.fast: a/b -> fdiv.fast(a, b) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals flushed. 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D71293 2020-01-24 08:57:43 +08:00
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`; GCN-LABEL: {{^}}rcp_uint_denorm:`
			`; GCN-NOT: v_rcp_iflag_f32`
			`define amdgpu_kernel void @rcp_uint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 {`
			`%load = load i32, i32 addrspace(1)* %in, align 4`
			`%cvt = uitofp i32 %load to float`
			`%div = fdiv float 1.000000e+00, %cvt`
			`store float %div, float addrspace(1)* %out, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}rcp_sint_denorm:`
			`; GCN-NOT: v_rcp_iflag_f32`
			`define amdgpu_kernel void @rcp_sint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 {`
			`%load = load i32, i32 addrspace(1)* %in, align 4`
			`%cvt = sitofp i32 %load to float`
			`%div = fdiv float 1.000000e+00, %cvt`
			`store float %div, float addrspace(1)* %out, align 4`
			`ret void`
			`}`

AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare Summary: RCP has the accuracy limit. If FDIV fpmath require high accuracy rcp may not meet the requirement. However, in DAG lowering, fpmath information gets lost, and thus we may generate either inaccurate rcp related computation or slow code for fdiv. In patch implements fdiv optimizations in the AMDGPUCodeGenPrepare, which could exactly know !fpmath. FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on unsafe-fp-math, fast math flags, denormals and fpmath accuracy request. RCP Optimizations: 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with denormals flushed. a/b -> a*rcp(b) when fast unsafe rcp is legal. Use fdiv.fast: a/b -> fdiv.fast(a, b) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals flushed. 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D71293 2020-01-24 08:57:43 +08:00			`!0 = !{float 2.500000e+00}`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00
			`attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }`
			`attributes #1 = { "denormal-fp-math-f32"="ieee,ieee" }`