; llvm-project/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; GCN-DAG: buffer_store_short [[VELT0]]
; GCN-DAG: buffer_store_short [[VELT1]]
; Constant-index extraction from a <2 x i16> that is loaded through the
; addrspace(4) pointer %vec.ptr. Lane 1 is written to %out and lane 0 to the
; slot ten i16 elements past %out, so both lanes are stored to distinct
; addresses.
define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
  %pair = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %lane0 = extractelement <2 x i16> %pair, i32 0
  %lane1 = extractelement <2 x i16> %pair, i32 1
  %out.plus10 = getelementptr i16, i16 addrspace(1)* %out, i32 10
  ; Lanes are stored in swapped order relative to how they were extracted.
  store i16 %lane1, i16 addrspace(1)* %out, align 2
  store i16 %lane0, i16 addrspace(1)* %out.plus10, align 2
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr:
; GCN: s_load_dword [[IDX:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16
; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; GCN: buffer_store_short [[VELT1]]
; GCN: ScratchSize: 0
; Variable-index extraction from a <2 x i16> loaded through the addrspace(4)
; pointer %vec.ptr. The lane index is the kernel argument %idx, and the
; selected lane is stored to %out.
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
  %packed = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %picked = extractelement <2 x i16> %packed, i32 %idx
  store i16 %picked, i16 addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr:
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]
; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]]

; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]

; SI: buffer_store_short [[ELT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
; GCN: ScratchSize: 0{{$}}
; Variable-index extraction from a <2 x i16> loaded through the addrspace(4)
; pointer %vec.ptr. The lane index is read with a volatile load from
; %idx.ptr, offset by the workitem id x; the selected lane is stored to %out
; at the same workitem offset.
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
  %lane.id = call i32 @llvm.amdgcn.workitem.id.x()
  %lane.id64 = sext i32 %lane.id to i64
  ; Per-workitem slots in the index buffer and in the output buffer.
  %idx.slot = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %lane.id64
  %out.slot = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %lane.id64
  %sel = load volatile i32, i32 addrspace(1)* %idx.slot
  %packed = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %picked = extractelement <2 x i16> %packed, i32 %sel
  store i16 %picked, i16 addrspace(1)* %out.slot, align 2
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
; GCN: buffer_load_ushort
; GCN: buffer_store_short
; GCN: buffer_store_short
; Constant-index extraction from a <3 x i16> passed directly as the kernel
; argument %foo. Lane 2 is written to %out and lane 0 to the next i16 slot.
define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
  %lane0 = extractelement <3 x i16> %foo, i32 0
  %lane2 = extractelement <3 x i16> %foo, i32 2
  %out.plus1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  store i16 %lane2, i16 addrspace(1)* %out, align 2
  store i16 %lane0, i16 addrspace(1)* %out.plus1, align 2
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short

; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]
; GFX9-DAG: buffer_store_short [[VLOAD0]], off
; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
; GFX9-DAG: buffer_store_short [[VLOAD1]], off
; Constant-index extraction from a <4 x i16> passed directly as the kernel
; argument %foo. Lane 2 is written to %out and lane 0 to the slot ten i16
; elements past %out. Both stores are volatile.
define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
  %lane0 = extractelement <4 x i16> %foo, i32 0
  %lane2 = extractelement <4 x i16> %foo, i32 2
  %out.plus10 = getelementptr i16, i16 addrspace(1)* %out, i32 10
  store volatile i16 %lane2, i16 addrspace(1)* %out, align 2
  store volatile i16 %lane0, i16 addrspace(1)* %out.plus10, align 2
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort

; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short

; SICIVI: buffer_load_ushort
; SICIVI: buffer_store_short

; GFX9: buffer_load_ushort
; GFX9: global_load_short_d16_hi
; GFX9: global_load_short_d16 v
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
; GFX9: buffer_load_ushort
; GFX9: buffer_store_short
; Variable-index extraction from a <3 x i16> passed directly as the kernel
; argument %foo. The lane index is the kernel argument %idx, and the selected
; lane is stored to %out.
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
  %p0 = extractelement <3 x i16> %foo, i32 %idx
  ; Alignment is written out explicitly, as on the i16 stores of the
  ; constant-index kernels in this file.
  store i16 %p0, i16 addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort

; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short

; SICIVI: buffer_load_ushort
; SICIVI: buffer_store_short

; GFX9: s_load_dword
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
; GFX9: buffer_load_ushort
; GFX9: buffer_store_short
; Variable-index extraction from a <4 x i16> passed directly as the kernel
; argument %foo. The lane index is the kernel argument %idx, and the selected
; lane is stored to %out.
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
  %p0 = extractelement <4 x i16> %foo, i32 %idx
  ; Alignment is written out explicitly, as on the i16 stores of the
  ; constant-index kernels in this file.
  store i16 %p0, i16 addrspace(1)* %out, align 2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
CodeGen: Fix pointer info in SplitVecOp_EXTRACT_VECTOR_ELT/SplitVecRes_INSERT_VECTOR_ELT Two issues found when doing codegen for splitting vector with non-zero alloca addr space: DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT/SplitVecOp_EXTRACT_VECTOR_ELT uses dummy pointer info for creating SDStore. Since one pointer operand contains multiply and add, InferPointerInfo is unable to infer the correct pointer info, which ends up with a dummy pointer info for the target to lower store and results in isel failure. The fix is to introduce MachinePointerInfo::getUnknownStack to represent MachinePointerInfo which is known in alloca address space but without other information. TargetLowering::getVectorElementPointer uses value type of pointer in addr space 0 for multiplication of index and then add it to the pointer. However the pointer may be in an addr space which has different size than addr space 0. The fix is to use the pointer value type for index multiplication. Differential Revision: https://reviews.llvm.org/D39758 llvm-svn: 319622 2017-12-03 06:13:22 +08:00			`; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s`
			`; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s`
AMDGPU/GCN: Bring processors in sync with AMDGPUUsage - Add gfx704 - Change bonaire to gfx704 - Remove gfx804 - Remove gfx901 - Remove gfx903 Differential Revision: https://reviews.llvm.org/D40046 llvm-svn: 320194 2017-12-09 04:52:28 +08:00			`; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GFX9 %s`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00
AMDGPU: Fix broken check prefix in test llvm-svn: 290050 2016-12-18 04:03:59 +08:00			`; GCN-LABEL: {{^}}extract_vector_elt_v2i16:`
AMDGPU: Custom lower more vector operations This avoids stack usage. llvm-svn: 292846 2017-01-24 07:09:58 +08:00			`; GCN: s_load_dword [[VEC:s[0-9]+]]`
			`; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16`
			`; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]`
			`; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]`
			`; GCN-DAG: buffer_store_short [[VELT0]]`
			`; GCN-DAG: buffer_store_short [[VELT1]]`
[AMDGPU] Change constant addr space to 4 Differential Revision: https://reviews.llvm.org/D43170 llvm-svn: 325030 2018-02-14 02:00:25 +08:00			`define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {`
			`%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr`
AMDGPU: Custom lower more vector operations This avoids stack usage. llvm-svn: 292846 2017-01-24 07:09:58 +08:00			`%p0 = extractelement <2 x i16> %vec, i32 0`
			`%p1 = extractelement <2 x i16> %vec, i32 1`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10`
			`store i16 %p1, i16 addrspace(1)* %out, align 2`
			`store i16 %p0, i16 addrspace(1)* %out1, align 2`
			`ret void`
			`}`

DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr:`
			`; GCN: s_load_dword [[IDX:s[0-9]+]]`
AMDGPU: Custom lower more vector operations This avoids stack usage. llvm-svn: 292846 2017-01-24 07:09:58 +08:00			`; GCN: s_load_dword [[VEC:s[0-9]+]]`
			`; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16`
			`; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]`
			`; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]`
			`; GCN: buffer_store_short [[VELT1]]`
			`; GCN: ScratchSize: 0`
[AMDGPU] Change constant addr space to 4 Differential Revision: https://reviews.llvm.org/D43170 llvm-svn: 325030 2018-02-14 02:00:25 +08:00			`define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {`
			`%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr`
DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`%elt = extractelement <2 x i16> %vec, i32 %idx`
			`store i16 %elt, i16 addrspace(1)* %out, align 2`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr:`
AMDGPU: Custom lower more vector operations This avoids stack usage. llvm-svn: 292846 2017-01-24 07:09:58 +08:00			`; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]`
AMDGPU: Start selecting global instructions llvm-svn: 309470 2017-07-29 09:03:53 +08:00			`; GCN-DAG: {{flat\|buffer\|global}}_load_dword [[IDX:v[0-9]+]]`
AMDGPU: Custom lower more vector operations This avoids stack usage. llvm-svn: 292846 2017-01-24 07:09:58 +08:00			`; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]]`

			`; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]`
			`; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]`

			`; SI: buffer_store_short [[ELT]]`
			`; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]`
			`; GCN: ScratchSize: 0{{$}}`
[AMDGPU] Change constant addr space to 4 Differential Revision: https://reviews.llvm.org/D43170 llvm-svn: 325030 2018-02-14 02:00:25 +08:00			`define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {`
DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`%tid = call i32 @llvm.amdgcn.workitem.id.x()`
			`%tid.ext = sext i32 %tid to i64`
			`%gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext`
			`%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext`
			`%idx = load volatile i32, i32 addrspace(1)* %gep`
[AMDGPU] Change constant addr space to 4 Differential Revision: https://reviews.llvm.org/D43170 llvm-svn: 325030 2018-02-14 02:00:25 +08:00			`%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr`
DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`%elt = extractelement <2 x i16> %vec, i32 %idx`
			`store i16 %elt, i16 addrspace(1)* %out.gep, align 2`
			`ret void`
			`}`

AMDGPU: Fix broken check prefix in test llvm-svn: 290050 2016-12-18 04:03:59 +08:00			`; GCN-LABEL: {{^}}extract_vector_elt_v3i16:`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`; GCN: buffer_load_ushort`
			`; GCN: buffer_store_short`
			`; GCN: buffer_store_short`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`%p0 = extractelement <3 x i16> %foo, i32 0`
			`%p1 = extractelement <3 x i16> %foo, i32 2`
			`%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1`
			`store i16 %p1, i16 addrspace(1)* %out, align 2`
			`store i16 %p0, i16 addrspace(1)* %out1, align 2`
			`ret void`
			`}`

AMDGPU: Fix broken check prefix in test llvm-svn: 290050 2016-12-18 04:03:59 +08:00			`; GCN-LABEL: {{^}}extract_vector_elt_v4i16:`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`

			`; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c`
			`; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30`
			`; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]`
			`; GFX9-DAG: buffer_store_short [[VLOAD0]], off`
			`; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]`
			`; GFX9-DAG: buffer_store_short [[VLOAD1]], off`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`%p0 = extractelement <4 x i16> %foo, i32 0`
			`%p1 = extractelement <4 x i16> %foo, i32 2`
			`%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`store volatile i16 %p1, i16 addrspace(1)* %out, align 2`
			`store volatile i16 %p0, i16 addrspace(1)* %out1, align 2`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`ret void`
			`}`

AMDGPU: Fix broken check prefix in test llvm-svn: 290050 2016-12-18 04:03:59 +08:00			`; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:`
AMDGPU: Match load d16 hi instructions Also starts selecting global loads for constant address in some cases. Some end up selecting to mubuf still, which requires investigation. We still get sub-optimal regalloc and extra waitcnts inserted due to not really tracking the liveness of the separate register halves. llvm-svn: 313716 2017-09-20 13:01:53 +08:00			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`

AMDGPU: Select d16 loads into low component of register llvm-svn: 318005 2017-11-13 08:22:09 +08:00			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_store_short`

AMDGPU: Match load d16 hi instructions Also starts selecting global loads for constant address in some cases. Some end up selecting to mubuf still, which requires investigation. We still get sub-optimal regalloc and extra waitcnts inserted due to not really tracking the liveness of the separate register halves. llvm-svn: 313716 2017-09-20 13:01:53 +08:00			`; GFX9: buffer_load_ushort`
			`; GFX9: global_load_short_d16_hi`
AMDGPU: Select d16 loads into low component of register llvm-svn: 318005 2017-11-13 08:22:09 +08:00			`; GFX9: global_load_short_d16 v`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; GFX9: buffer_store_dword`
			`; GFX9: buffer_store_dword`
AMDGPU: Select d16 loads into low component of register llvm-svn: 318005 2017-11-13 08:22:09 +08:00			`; GFX9: buffer_load_ushort`
			`; GFX9: buffer_store_short`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`%p0 = extractelement <3 x i16> %foo, i32 %idx`
			`%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1`
			`store i16 %p0, i16 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check prefix in test llvm-svn: 290050 2016-12-18 04:03:59 +08:00			`; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:`
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_load_ushort`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`
			`; SICIVI: buffer_store_short`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00
AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 2017-02-28 06:15:25 +08:00			`; SICIVI: buffer_load_ushort`
			`; SICIVI: buffer_store_short`

			`; GFX9: s_load_dword`
			`; GFX9: buffer_store_dword`
			`; GFX9: buffer_store_dword`
			`; GFX9: buffer_load_ushort`
			`; GFX9: buffer_store_short`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {`
AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`%p0 = extractelement <4 x i16> %foo, i32 %idx`
			`%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1`
			`store i16 %p0, i16 addrspace(1)* %out`
			`ret void`
			`}`

DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`declare i32 @llvm.amdgcn.workitem.id.x() #1`

AMDGPU: Cleanup vector insert/extract tests This mostly makes sure that 3-vector dynamic inserts and extracts are covered. llvm-svn: 271082 2016-05-28 08:51:06 +08:00			`attributes #0 = { nounwind }`
DAG: Don't fold vector extract into load if target doesn't want to Fixes turning a 32-bit scalar load into an extending vector load for AMDGPU when dynamically indexing a vector. llvm-svn: 292842 2017-01-24 06:48:53 +08:00			`attributes #1 = { nounwind readnone }`