llvm-project/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}uitofp_i16_to_f16
; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
; SI:  v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; VI:  v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; Scalar i16 case: read one i16 through %a, convert it to half as an
; unsigned integer, and write the result through %r.
define amdgpu_kernel void @uitofp_i16_to_f16(half addrspace(1)* %r, i16 addrspace(1)* %a) {
entry:
  %src = load i16, i16 addrspace(1)* %a
  %cvt = uitofp i16 %src to half
  store half %cvt, half addrspace(1)* %r
  ret void
}

; Scalar i32 case: read one i32 through %a, convert it to half as an
; unsigned integer, and write the result through %r. The checks below expect
; the conversion to go through an f32 intermediate on both subtargets.
; GCN-LABEL: {{^}}uitofp_i32_to_f16
; GCN: buffer_load_dword v[[A_I32:[0-9]+]]
; GCN: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I32]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @uitofp_i32_to_f16(
    half addrspace(1)* %r,
    i32 addrspace(1)* %a) {
entry:
  %a.val = load i32, i32 addrspace(1)* %a
  %r.val = uitofp i32 %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; f16 = uitofp i64 is in uint_to_fp.i64.ll

; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16
; GCN:     buffer_load_dword

; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI-DAG: v_lshlrev_b32_e32
; SI: v_or_b32_e32

; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa
; VI:     v_or_b32_e32

; GCN: buffer_store_dword
; GCN: s_endpgm
; Vector <2 x i16> case: read the vector through %a, convert both lanes to
; half as unsigned integers, and write the <2 x half> result through %r.
define amdgpu_kernel void @uitofp_v2i16_to_v2f16(<2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) {
entry:
  %src.vec = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %cvt.vec = uitofp <2 x i16> %src.vec to <2 x half>
  store <2 x half> %cvt.vec, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16
; GCN:     buffer_load_dwordx2

; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI-DAG: v_lshlrev_b32_e32
; SI: v_or_b32_e32

; VI-DAG: v_cvt_f32_u32_e32
; VI-DAG: v_cvt_f32_u32_e32
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_sdwa
; VI:     v_or_b32_e32

; GCN:     buffer_store_dword
; GCN:     s_endpgm
; Vector <2 x i32> case: read the vector through %a, convert both lanes to
; half as unsigned integers, and write the <2 x half> result through %r.
define amdgpu_kernel void @uitofp_v2i32_to_v2f16(<2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) {
entry:
  %src.vec = load <2 x i32>, <2 x i32> addrspace(1)* %a
  %cvt.vec = uitofp <2 x i32> %src.vec to <2 x half>
  store <2 x half> %cvt.vec, <2 x half> addrspace(1)* %r
  ret void
}

; i1 case: build an i1 as the xor of two ordered float compares
; (%a >= 0.0 and %b >= 1.0), convert it to half with uitofp, and store the
; result through %out. The checks expect the i1 to be materialized as a
; 0 / 1.0 select followed by an f32-to-f16 conversion.
;
; The label check uses the GCN prefix because that is the only common prefix
; enabled by the RUN lines; a label under a prefix that no RUN line enables
; is never checked and leaves the following checks unanchored.
; GCN-LABEL: {{^}}s_uint_to_fp_i1_to_f16:
; GCN-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
; GCN-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
; GCN: s_xor_b64 [[R_CMP:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP0]]
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[R_CMP]]
; GCN-NEXT: v_cvt_f16_f32_e32 [[R_F16:v[0-9]+]], [[RESULT]]
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(half addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
  %a = load float, float addrspace(1)* %in0
  %b = load float, float addrspace(1)* %in1
  %acmp = fcmp oge float %a, 0.000000e+00
  %bcmp = fcmp oge float %b, 1.000000e+00
  %result = xor i1 %acmp, %bcmp
  %fp = uitofp i1 %result to half
  store half %fp, half addrspace(1)* %out
  ret void
}

; f16 = uitofp i64 is in uint_to_fp.i64.ll
AMDGPU: Cleanup subtarget features Try to avoid mutually exclusive features. Don't use a real default GPU, and use a fake "generic". The goal is to make it easier to see which set of features are incompatible between feature strings. Most of the test changes are due to random scheduling changes from not having a default fullspeed model. llvm-svn: 310258 2017-08-07 22:58:04 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00
			`; GCN-LABEL: {{^}}uitofp_i16_to_f16`
			`; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]`
			`; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]`
[AMDGPU] Promote f16/i16 conversions to f32/i32 llvm-svn: 287201 2016-11-17 12:00:46 +08:00			`; VI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]`
			`; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_short v[[R_F16]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @uitofp_i16_to_f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`half addrspace(1)* %r,`
			`i16 addrspace(1)* %a) {`
			`entry:`
			`%a.val = load i16, i16 addrspace(1)* %a`
			`%r.val = uitofp i16 %a.val to half`
			`store half %r.val, half addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}uitofp_i32_to_f16`
			`; GCN: buffer_load_dword v[[A_I32:[0-9]+]]`
			`; GCN: v_cvt_f32_u32_e32 v[[A_I16:[0-9]+]], v[[A_I32]]`
			`; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]`
			`; GCN: buffer_store_short v[[R_F16]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @uitofp_i32_to_f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`half addrspace(1)* %r,`
			`i32 addrspace(1)* %a) {`
			`entry:`
			`%a.val = load i32, i32 addrspace(1)* %a`
			`%r.val = uitofp i32 %a.val to half`
			`store half %r.val, half addrspace(1)* %r`
			`ret void`
			`}`

			`; f16 = uitofp i64 is in uint_to_fp.i64.ll`

			`; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16`
[AMDGPU] Promote f16/i16 conversions to f32/i32 llvm-svn: 287201 2016-11-17 12:00:46 +08:00			`; GCN: buffer_load_dword`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
			`; SI: v_cvt_f32_u32_e32`
			`; SI: v_cvt_f32_u32_e32`
			`; SI: v_cvt_f16_f32_e32`
			`; SI: v_cvt_f16_f32_e32`
			`; SI-DAG: v_lshlrev_b32_e32`
			`; SI: v_or_b32_e32`

			`; VI-DAG: v_cvt_f16_f32_e32`
			`; VI-DAG: v_cvt_f32_i32_sdwa`
			`; VI-DAG: v_cvt_f32_i32_sdwa`
			`; VI-DAG: v_cvt_f16_f32_sdwa`
			`; VI: v_or_b32_e32`

			`; GCN: buffer_store_dword`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @uitofp_v2i16_to_v2f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x half> addrspace(1)* %r,`
			`<2 x i16> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a`
			`%r.val = uitofp <2 x i16> %a.val to <2 x half>`
			`store <2 x half> %r.val, <2 x half> addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16`
			`; GCN: buffer_load_dwordx2`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
			`; SI: v_cvt_f32_u32_e32`
			`; SI: v_cvt_f32_u32_e32`
			`; SI: v_cvt_f16_f32_e32`
			`; SI: v_cvt_f16_f32_e32`
			`; SI-DAG: v_lshlrev_b32_e32`
			`; SI: v_or_b32_e32`

			`; VI-DAG: v_cvt_f32_u32_e32`
			`; VI-DAG: v_cvt_f32_u32_e32`
			`; VI-DAG: v_cvt_f16_f32_e32`
			`; VI-DAG: v_cvt_f16_f32_sdwa`
			`; VI: v_or_b32_e32`

[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_dword`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @uitofp_v2i32_to_v2f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x half> addrspace(1)* %r,`
			`<2 x i32> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a`
			`%r.val = uitofp <2 x i32> %a.val to <2 x half>`
			`store <2 x half> %r.val, <2 x half> addrspace(1)* %r`
			`ret void`
			`}`

[AMDGPU] Add instruction selection for i1 to f16 conversion Summary: This is required for GPUs with 16 bit instructions where f16 is a legal register type and hence int_to_fp i1 to f16 is not lowered by legalizing. Reviewers: arsenm, nhaehnle Reviewed By: nhaehnle Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D52018 Change-Id: Ie4c0fd6ced7cf10ad612023c6879724d9ded5851 llvm-svn: 342558 2018-09-20 00:32:12 +08:00			`; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f16:`
			`; GCN-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}`
			`; GCN-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}`
			`; GCN: s_xor_b64 [[R_CMP:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP0]]`
			`; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[R_CMP]]`
			`; GCN-NEXT: v_cvt_f16_f32_e32 [[R_F16:v[0-9]+]], [[RESULT]]`
			`; GCN: buffer_store_short`
			`; GCN: s_endpgm`
			`define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(half addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {`
			`%a = load float, float addrspace(1) * %in0`
			`%b = load float, float addrspace(1) * %in1`
			`%acmp = fcmp oge float %a, 0.000000e+00`
			`%bcmp = fcmp oge float %b, 1.000000e+00`
			`%result = xor i1 %acmp, %bcmp`
			`%fp = uitofp i1 %result to half`
			`store half %fp, half addrspace(1)* %out`
			`ret void`
			`}`

[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; f16 = uitofp i64 is in uint_to_fp.i64.ll`