llvm-project/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s

declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)

; GCN-LABEL: {{^}}rint_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @rint_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.rint.f16(half %a.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}rint_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI:  v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI:  v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @rint_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
AMDGPU: Cleanup subtarget features Try to avoid mutually exclusive features. Don't use a real default GPU, and use a fake "generic". The goal is to make it easier to see which set of features are incompatible between feature strings. Most of the test changes are due to random scheduling changes from not having a default fullspeed model. llvm-svn: 310258 2017-08-07 22:58:04 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00
			`declare half @llvm.rint.f16(half %a)`
			`declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)`

			`; GCN-LABEL: {{^}}rint_f16`
			`; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]`
			`; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]`
			`; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]`
			`; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]`
AMDGPU: Implement f16 fround llvm-svn: 298730 2017-03-25 04:04:18 +08:00			`; GFX89: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_short v[[R_F16]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @rint_f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`half addrspace(1)* %r,`
			`half addrspace(1)* %a) {`
			`entry:`
			`%a.val = load half, half addrspace(1)* %a`
			`%r.val = call half @llvm.rint.f16(half %a.val)`
			`store half %r.val, half addrspace(1)* %r`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}rint_v2f16`
			`; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]`
			`; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]`
			`; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]`
			`; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]`
			`; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]`
			`; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]`
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]`
			`; SI-NOT: v_and_b32`
AMDGPU: Allow SIShrinkInstructions to work in non-SSA Immediates can be folded as long as the immediate is a vreg. Also undo commuting instructions if it didn't fold an immediate. llvm-svn: 307575 2017-07-11 03:53:57 +08:00			`; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]`
AMDGPU: Implement f16 fround llvm-svn: 298730 2017-03-25 04:04:18 +08:00
[AMDGPU] Resubmit SDWA peephole: enable by default Reviewers: vpykhtin, rampitec, arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D31671 llvm-svn: 299654 2017-04-06 23:03:28 +08:00			`; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]`
			`; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1`
AMDGPU: Remove unnecessary ands when f16 is legal Add a new node to act as a fancy bitcast from f16 operations to i32 that implicitly zero the high 16-bits of the result. Alternatively could try making v2f16 legal and canonicalizing on build_vectors. llvm-svn: 299246 2017-04-01 03:53:03 +08:00			`; VI-NOT: v_and_b32`
AMDGPU: Allow SIShrinkInstructions to work in non-SSA Immediates can be folded as long as the immediate is a vreg. Also undo commuting instructions if it didn't fold an immediate. llvm-svn: 307575 2017-07-11 03:53:57 +08:00			`; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]`
AMDGPU: Implement f16 fround llvm-svn: 298730 2017-03-25 04:04:18 +08:00
			`; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]`
[AMDGPU] SDWA: add support for GFX9 in peephole pass Summary: Added support based on merged SDWA pseudo instructions. Now peephole allow one scalar operand, omod and clamp modifiers. Added several subtarget features for GFX9 SDWA. This diff also contains changes from D34026. Depends D34026 Reviewers: vpykhtin, rampitec, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34241 llvm-svn: 305986 2017-06-22 14:26:41 +08:00			`; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1`
AMDGPU/GFX9: Do not use v_pack_b32_f16 when packing Differential Revision: https://reviews.llvm.org/D31819 llvm-svn: 300275 2017-04-14 07:17:00 +08:00			`; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]`
			`; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]`
AMDGPU: Implement f16 fround llvm-svn: 298730 2017-03-25 04:04:18 +08:00
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`; GCN: buffer_store_dword v[[R_V2_F16]]`
			`; GCN: s_endpgm`
[AMDGPU] SDWA: add support for GFX9 in peephole pass Summary: Added support based on merged SDWA pseudo instructions. Now peephole allow one scalar operand, omod and clamp modifiers. Added several subtarget features for GFX9 SDWA. This diff also contains changes from D34026. Depends D34026 Reviewers: vpykhtin, rampitec, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34241 llvm-svn: 305986 2017-06-22 14:26:41 +08:00
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @rint_v2f16(`
[AMDGPU] Add f16 support (VI+) Differential Revision: https://reviews.llvm.org/D25975 llvm-svn: 286753 2016-11-13 15:01:11 +08:00			`<2 x half> addrspace(1)* %r,`
			`<2 x half> addrspace(1)* %a) {`
			`entry:`
			`%a.val = load <2 x half>, <2 x half> addrspace(1)* %a`
			`%r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)`
			`store <2 x half> %r.val, <2 x half> addrspace(1)* %r`
			`ret void`
			`}`