llvm-project/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll

; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}fptosi_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_I16]]
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_f16_to_i16(
    i16 addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fptosi half %a.val to i16
  store i16 %r.val, i16 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptosi_f16_to_i32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_f16_to_i32(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fptosi half %a.val to i32
  store i32 %r.val, i32 addrspace(1)* %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_sint f32'.

; GCN-LABEL: {{^}}fptosi_f16_to_i64
; GCN: buffer_load_ushort
; GCN: v_cvt_f32_f16_e32
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_f16_to_i64(
    i64 addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = fptosi half %a.val to i64
  store i64 %r.val, i64 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]

; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm

define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
    <2 x i16> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fptosi <2 x half> %a.val to <2 x i16>
  store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_i32_f32_e32
; GCN: v_cvt_i32_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fptosi <2 x half> %a.val to <2 x i32>
  store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_sint f32'.

; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
    <2 x i64> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fptosi <2 x half> %a.val to <2 x i64>
  store <2 x i64> %r.val, <2 x i64> addrspace(1)* %r
  ret void
}
[AMDGPU] Switch scalarize global loads ON by default Differential revision: https://reviews.llvm.org/D34407 llvm-svn: 307097 2017-07-05 01:32:00 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00
			`; GCN-LABEL: {{^}}fptosi_f16_to_i16`
			`; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]`
[AMDGPU] Promote f16/i16 conversions to f32/i32 llvm-svn: 287201 2016-11-17 12:00:46 +08:00			`; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]`
			`; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_short v[[R_I16]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_f16_to_i16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`i16 addrspace(1)* %r,`
			`half addrspace(1)* %a) {`
			`entry:`
			`%a.val = load half, half addrspace(1)* %a`
			`%r.val = fptosi half %a.val to i16`
			`store i16 %r.val, i16 addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}fptosi_f16_to_i32`
			`; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]`
			`; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]`
			`; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]`
			`; GCN: buffer_store_dword v[[R_I32]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_f16_to_i32(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`i32 addrspace(1)* %r,`
			`half addrspace(1)* %a) {`
			`entry:`
			`%a.val = load half, half addrspace(1)* %a`
			`%r.val = fptosi half %a.val to i32`
			`store i32 %r.val, i32 addrspace(1)* %r`
			`ret void`
			`}`

			`; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing`
			`; test checks code generated for 'i64 = fp_to_sint f32'.`

			`; GCN-LABEL: {{^}}fptosi_f16_to_i64`
			`; GCN: buffer_load_ushort`
			`; GCN: v_cvt_f32_f16_e32`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_f16_to_i64(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`i64 addrspace(1)* %r,`
			`half addrspace(1)* %a) {`
			`entry:`
			`%a.val = load half, half addrspace(1)* %a`
			`%r.val = fptosi half %a.val to i64`
			`store i64 %r.val, i64 addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16`
			`; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
			`; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]`
			`; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]`
			`; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]`
			`; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]`
AMDGPU: Cleanup subtarget features Try to avoid mutually exclusive features. Don't use a real default GPU, and use a fake "generic". The goal is to make it easier to see which set of features are incompatible between feature strings. Most of the test changes are due to random scheduling changes from not having a default fullspeed model. llvm-svn: 310258 2017-08-07 22:58:04 +08:00			`; SI-DAG: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]`
			`; SI-DAG: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]`
AMDGPU: Allow SIShrinkInstructions to work in non-SSA Immediates can be folded as long as the immediate is a vreg. Also undo commuting instructions if it didn't fold an immediate. llvm-svn: 307575 2017-07-11 03:53:57 +08:00			`; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
			`; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]`
			`; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1`
			`; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]`
			`; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD`
[AMDGPU] Untangle SDWA pass from SIShrinkInstructions Remove dependency of SDWA pass on SIShrinkInstructions. The goal is to move SDWA even higher in the stack to avoid second run of MachineLICM, MachineCSE and SIFoldOperands. Also added handling to preserve original src modifiers. Differential Revision: https://reviews.llvm.org/D33860 llvm-svn: 304665 2017-06-04 01:39:47 +08:00			`; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_dword v[[R_V2_I16]]`
			`; GCN: s_endpgm`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_v2f16_to_v2i16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x i16> addrspace(1)* %r,`
			`<2 x half> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x half>, <2 x half> addrspace(1)* %a`
			`%r.val = fptosi <2 x half> %a.val to <2 x i16>`
			`store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32`
[AMDGPU] Promote f16/i16 conversions to f32/i32 llvm-svn: 287201 2016-11-17 12:00:46 +08:00			`; GCN: buffer_load_dword`
			`; GCN: v_cvt_f32_f16_e32`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; SI: v_cvt_f32_f16_e32`
			`; VI: v_cvt_f32_f16_sdwa`
[AMDGPU] Promote f16/i16 conversions to f32/i32 llvm-svn: 287201 2016-11-17 12:00:46 +08:00			`; GCN: v_cvt_i32_f32_e32`
			`; GCN: v_cvt_i32_f32_e32`
			`; GCN: buffer_store_dwordx2`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_v2f16_to_v2i32(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x i32> addrspace(1)* %r,`
			`<2 x half> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x half>, <2 x half> addrspace(1)* %a`
			`%r.val = fptosi <2 x half> %a.val to <2 x i32>`
			`store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r`
			`ret void`
			`}`

			`; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing`
			`; test checks code generated for 'i64 = fp_to_sint f32'.`

			`; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64`
			`; GCN: buffer_load_dword`
			`; GCN: v_cvt_f32_f16_e32`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; SI: v_cvt_f32_f16_e32`
			`; VI: v_cvt_f32_f16_sdwa`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fptosi_v2f16_to_v2i64(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x i64> addrspace(1)* %r,`
			`<2 x half> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x half>, <2 x half> addrspace(1)* %a`
			`%r.val = fptosi <2 x half> %a.val to <2 x i64>`
			`store <2 x i64> %r.val, <2 x i64> addrspace(1)* %r`
			`ret void`
			`}`