llvm-project/llvm/test/CodeGen/AMDGPU/fneg.f16.ll

; Three llc invocations share this file. Each one turns on its own list of
; FileCheck prefixes through -check-prefixes, so a check line only applies to
; the invocations that list its prefix.
;   -mcpu=kaveri  enables GCN, CI, CIVI
;   -mcpu=tonga   enables GCN, VI, CIVI, GFX89
;   -mcpu=gfx900  enables GCN, GFX89, GFX9
; All three pass -amdgpu-scalarize-global-loads=false and -verify-machineinstrs,
; and FileCheck runs with -enable-var-scope.
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; Negates a half passed directly as a kernel argument, written as (-0.0 - x),
; and stores the result through %out. Only the function label is matched;
; there are no instruction checks for this kernel.
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
  %neg = fsub half -0.0, %in
  store half %neg, half addrspace(1)* %out
  ret void
}

; Negates a half loaded from global memory. Each work-item reads the element
; of %in selected by its workitem id and writes the negated value to the same
; index of %out.
;
; The checks expect the negation to be a 16-bit sign-bit flip (xor with
; 0x8000) on the loaded value, and the xor result to be what gets stored.
; The store check uses the common prefix so every llc invocation verifies it.
;
; FIXME: Should be able to use bit operations when illegal type as
; well.

; GCN-LABEL: {{^}}v_fneg_f16:
; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]],
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
  ; The output address is derived from %out so the result does not overwrite
  ; the input buffer.
  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i32 %tid
  %val = load half, half addrspace(1)* %gep.in, align 2
  %fneg = fsub half -0.0, %val
  store half %fneg, half addrspace(1)* %gep.out
  ret void
}

; An i16 kernel argument is reinterpreted as half with a bitcast, negated as
; (-0.0 - x), and stored through %out. The xor and store are only checked for
; the kaveri invocation; the line with the X-prefixed name below is inert
; because no invocation enables that prefix.
; GCN-LABEL: {{^}}fneg_free_f16:
; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]],

; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
  %as.half = bitcast i16 %in to half
  %neg = fsub half -0.0, %as.half
  store half %neg, half addrspace(1)* %out
  ret void
}

; Computes (-x) * x for a half loaded from %in and stores it through %out.
; The checks expect the negation to show up as a '-' source modifier on an
; operand instead of a separate instruction. For kaveri the modifier is on one
; of the two half-to-float conversions, followed by a float multiply and a
; conversion back. For tonga it is on the first operand of a half multiply,
; and the loaded register must not be mentioned between the load and that
; multiply.
; GCN-LABEL: {{^}}v_fneg_fold_f16:
; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]]

; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]
; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]

; VI-NOT: [[NEG_VALUE]]
; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %x = load half, half addrspace(1)* %in
  %neg.x = fsub half -0.0, %x
  %prod = fmul half %neg.x, %x
  store half %prod, half addrspace(1)* %out
  ret void
}

; Negates both lanes of a <2 x half> kernel argument by subtracting it from
; <-0.0, -0.0>, then stores the vector through %out.
; Expected shapes per invocation, as encoded in the checks below: kaveri
; flips each lane separately with a 0x8000 mask and recombines them with a
; shift and an or; tonga uses one scalar xor with 0x80008000; gfx900 uses one
; vector xor with 0x80008000.
; FIXME: Terrible code with SI/CI.
; FIXME: scalar for VI, vector for gfx9
; GCN-LABEL: {{^}}s_fneg_v2f16:
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
; CI: v_or_b32_e32

; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000

; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
  %neg.vec = fsub <2 x half> <half -0.0, half -0.0>, %in
  store <2 x half> %neg.vec, <2 x half> addrspace(1)* %out
  ret void
}

; Negates both lanes of a <2 x half> loaded from global memory. Each work-item
; reads the element of %in selected by its workitem id and writes the negated
; vector to the same index of %out.
; The checks expect one dword load followed by a single xor with 0x80008000
; (the sign bit of each 16-bit lane) applied to the loaded register.
; GCN-LABEL: {{^}}v_fneg_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  ; The output address is derived from %out so the result does not overwrite
  ; the input buffer.
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  ; The load is deliberately given only 2-byte alignment; the check above
  ; still expects a single dword load.
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
  ret void
}

; An i32 kernel argument is reinterpreted as <2 x half> with a bitcast,
; negated lane-wise, and stored through %out.
; The checks expect a scalar dword load of the argument. kaveri and tonga then
; xor it with 0x80008000 in a scalar register; gfx900 first copies it to a
; vector register and xors there.
; GCN-LABEL: {{^}}fneg_free_v2f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000

; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
  %as.vec = bitcast i32 %in to <2 x half>
  %neg.vec = fsub <2 x half> <half -0.0, half -0.0>, %as.vec
  store <2 x half> %neg.vec, <2 x half> addrspace(1)* %out
  ret void
}

; Computes (-x) * x lane-wise for a <2 x half> loaded from %in and stores the
; product through %out.
; The checks expect the negation as a source modifier in every configuration.
; kaveri has a '-' on two half-to-float conversions, each lane then multiplied
; in float and converted back. tonga has a '-' on the first operand of one
; SDWA half multiply for the high word and one plain half multiply. gfx900
; has a single packed multiply with neg_lo and neg_hi set for the first source.
; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32

; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}

; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %neg.x = fsub <2 x half> <half -0.0, half -0.0>, %x
  %prod = fmul <2 x half> %neg.x, %x
  store <2 x half> %prod, <2 x half> addrspace(1)* %out
  ret void
}

; A <2 x half> is negated as a whole, then each lane is extracted and fed to
; an arithmetic op: lane 0 is multiplied by 4.0 and lane 1 has 2.0 added.
; Both results go to volatile stores through an undef pointer, which keeps
; the two computations alive.
; The checks expect the negation to be absorbed into each user: the multiply
; constant becomes -4.0, and the add becomes a subtract from 2.0 (0x4000 as a
; half on tonga and gfx900, reading the high word of the loaded dword).
; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}

; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
  %vec = load <2 x half>, <2 x half> addrspace(1)* %in
  %neg.vec = fsub <2 x half> <half -0.0, half -0.0>, %vec
  %lo = extractelement <2 x half> %neg.vec, i32 0
  %hi = extractelement <2 x half> %neg.vec, i32 1

  %lo.scaled = fmul half %lo, 4.0
  %hi.offset = fadd half %hi, 2.0
  store volatile half %lo.scaled, half addrspace(1)* undef
  store volatile half %hi.offset, half addrspace(1)* undef
  ret void
}

; A <2 x half> is negated as a whole and both lanes are extracted and stored
; directly through volatile stores to an undef pointer, with no arithmetic
; user to absorb the negation.
; The checks therefore expect an explicit xor with 0x80008000 on the loaded
; dword. kaveri and tonga then shift the result right by 16 to obtain the
; high lane; gfx900 stores the high half straight from the xor result with a
; d16_hi store.
; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
; CIVI: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[NEG]], off
define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
  %vec = load <2 x half>, <2 x half> addrspace(1)* %in
  %neg.vec = fsub <2 x half> <half -0.0, half -0.0>, %vec
  %lo = extractelement <2 x half> %neg.vec, i32 0
  %hi = extractelement <2 x half> %neg.vec, i32 1
  store volatile half %lo, half addrspace(1)* undef
  store volatile half %hi, half addrspace(1)* undef
  ret void
}

; Intrinsic called by the v_fneg_f16 and v_fneg_v2f16 kernels to obtain the
; i32 index used in their getelementptr address computations.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute group #0 is attached to every kernel defined above.
attributes #0 = { nounwind }
; Attribute group #1 is attached to the workitem id intrinsic declaration.
attributes #1 = { nounwind readnone }
AMDGPU: Match store d16_hi instructions llvm-svn: 313712 2017-09-20 11:20:09 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s`
AMDGPU/GCN: Bring processors in sync with AMDGPUUsage - Add gfx704 - Change bonaire to gfx704 - Remove gfx804 - Remove gfx901 - Remove gfx903 Differential Revision: https://reviews.llvm.org/D40046 llvm-svn: 320194 2017-12-09 04:52:28 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00
			`; FIXME: Should be able to do scalar op`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN-LABEL: {{^}}s_fneg_f16:`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%fneg = fsub half -0.0, %in`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`store half %fneg, half addrspace(1)* %out`
			`ret void`
			`}`

			`; FIXME: Should be able to use bit operations when illegal type as`
			`; well.`

AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN-LABEL: {{^}}v_fneg_f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]],`
AMDGPU: Use source modifiers with f16->f32 conversions The operand types were defined to fit the fp16_to_fp node, which has the half as an integer type. v_cvt_f32_f16 does support source modifiers, so change this to have an FP type and modifiers. For targets without legal f16, this requires recognizing the bit operations and trying to produce them. llvm-svn: 293857 2017-02-02 10:27:04 +08:00			`; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]`
AMDGPU: Use source modifiers with f16->f32 conversions The operand types were defined to fit the fp16_to_fp node, which has the half as an integer type. v_cvt_f32_f16 does support source modifiers, so change this to have an FP type and modifiers. For targets without legal f16, this requires recognizing the bit operations and trying to produce them. llvm-svn: 293857 2017-02-02 10:27:04 +08:00			`; SI: buffer_store_short [[XOR]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%tid = call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid`
			`%val = load half, half addrspace(1)* %gep.in, align 2`
			`%fneg = fsub half -0.0, %val`
			`store half %fneg, half addrspace(1)* %gep.out`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`ret void`
			`}`

AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN-LABEL: {{^}}fneg_free_f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_ushort [[NEG_VALUE:v[0-9]+]],`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00
			`; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}`
			`; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]`
			`; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`%bc = bitcast i16 %in to half`
			`%fsub = fsub half -0.0, %bc`
			`store half %fsub, half addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN-LABEL: {{^}}v_fneg_fold_f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_ushort [[NEG_VALUE:v[0-9]+]]`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00
AMDGPU: Use source modifiers with f16->f32 conversions The operand types were defined to fit the fp16_to_fp node, which has the half as an integer type. v_cvt_f32_f16 does support source modifiers, so change this to have an FP type and modifiers. For targets without legal f16, this requires recognizing the bit operations and trying to produce them. llvm-svn: 293857 2017-02-02 10:27:04 +08:00			`; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]`
			`; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]`
AMDGPU: Allow SIShrinkInstructions to work in non-SSA Immediates can be folded as long as the immediate is a vreg. Also undo commuting instructions if it didn't fold an immediate. llvm-svn: 307575 2017-07-11 03:53:57 +08:00			`; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]`
			`; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]`

			`; VI-NOT: [[NEG_VALUE]]`
			`; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {`
AMDGPU: Fix f16 fabs/fneg llvm-svn: 286931 2016-11-15 10:25:28 +08:00			`%val = load half, half addrspace(1)* %in`
			`%fsub = fsub half -0.0, %val`
			`%fmul = fmul half %fsub, %val`
			`store half %fmul, half addrspace(1)* %out`
			`ret void`
			`}`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00
AMDGPU: Make v2i16/v2f16 legal on VI This usually results in better code. Fixes using inline asm with short2, and also fixes having a different ABI for function parameters between VI and gfx9. Partially cleans up the mess used for lowering of the d16 operations. Making v4f16 legal will help clean this up more, but this requires additional work. llvm-svn: 332953 2018-05-22 14:32:10 +08:00			`; FIXME: Terrible code with SI/CI.`
			`; FIXME: scalar for VI, vector for gfx9`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN-LABEL: {{^}}s_fneg_v2f16:`
			`; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}`
			`; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}`
			`; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}`
			`; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}`
			`; CI: v_or_b32_e32`

AMDGPU: Make v2i16/v2f16 legal on VI This usually results in better code. Fixes using inline asm with short2, and also fixes having a different ABI for function parameters between VI and gfx9. Partially cleans up the mess used for lowering of the d16 operations. Making v4f16 legal will help clean this up more, but this requires additional work. llvm-svn: 332953 2018-05-22 14:32:10 +08:00			`; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00
			`; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in`
			`store <2 x half> %fneg, <2 x half> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}v_fneg_v2f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_dword [[VAL:v[0-9]+]]`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%tid = call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid`
			`%val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2`
			`%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val`
			`store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}fneg_free_v2f16:`
			`; GCN: s_load_dword [[VAL:s[0-9]+]]`
			`; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000`

			`; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]`
			`; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%bc = bitcast i32 %in to <2 x half>`
			`%fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc`
			`store <2 x half> %fsub, <2 x half> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}v_fneg_fold_v2f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_dword [[VAL:v[0-9]+]]`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00
			`; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}`
			`; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}`
			`; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}`
			`; CI: v_cvt_f16_f32`
			`; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}`
			`; CI: v_cvt_f16_f32`

[AMDGPU] SDWA: add support for PRESERVE into SDWA peephole. Summary: Reviewers: arsenm, vpykhtin, rampitec Subscribers: kzhuravl, wdng, nhaehnle, mgorny, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D37817 llvm-svn: 319662 2017-12-05 00:22:32 +08:00			`; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}`

			`; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`%val = load <2 x half>, <2 x half> addrspace(1)* %in`
			`%fsub = fsub <2 x half> <half -0.0, half -0.0>, %val`
			`%fmul = fmul <2 x half> %fsub, %val`
			`store <2 x half> %fmul, <2 x half> addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813 2017-05-12 01:26:25 +08:00			`; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN-DAG: {{flat\|global}}_load_dword [[VAL:v[0-9]+]]`
AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813 2017-05-12 01:26:25 +08:00			`; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}`
			`; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}`

			`; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]`
[AMDGPU] SDWA: add support for GFX9 in peephole pass Summary: Added support based on merged SDWA pseudo instructions. Now peephole allow one scalar operand, omod and clamp modifiers. Added several subtarget features for GFX9 SDWA. This diff also contains changes from D34026. Depends D34026 Reviewers: vpykhtin, rampitec, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34241 llvm-svn: 305986 2017-06-22 14:26:41 +08:00			`; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000`
			`; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1`

AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813 2017-05-12 01:26:25 +08:00			`define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {`
			`%val = load <2 x half>, <2 x half> addrspace(1)* %in`
			`%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val`
			`%elt0 = extractelement <2 x half> %fneg, i32 0`
			`%elt1 = extractelement <2 x half> %fneg, i32 1`

			`%fmul0 = fmul half %elt0, 4.0`
			`%fadd1 = fadd half %elt1, 2.0`
			`store volatile half %fmul0, half addrspace(1)* undef`
			`store volatile half %fadd1, half addrspace(1)* undef`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN: {{flat\|global}}_load_dword [[VAL:v[0-9]+]]`
AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813 2017-05-12 01:26:25 +08:00			`; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]`
AMDGPU: Match store d16_hi instructions llvm-svn: 313712 2017-09-20 11:20:09 +08:00			`; CIVI: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]`
			`; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[NEG]], off`
AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813 2017-05-12 01:26:25 +08:00			`define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {`
			`%val = load <2 x half>, <2 x half> addrspace(1)* %in`
			`%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val`
			`%elt0 = extractelement <2 x half> %fneg, i32 0`
			`%elt1 = extractelement <2 x half> %fneg, i32 1`
			`store volatile half %elt0, half addrspace(1)* undef`
			`store volatile half %elt1, half addrspace(1)* undef`
			`ret void`
			`}`

AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`declare i32 @llvm.amdgcn.workitem.id.x() #1`

			`attributes #0 = { nounwind }`
			`attributes #1 = { nounwind readnone }`