llvm-project/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll

; RUN: llc -O0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s

; FIXME: Merge into indirect-addressing-si.ll

; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
; of the tied implicit use and def of the super register.

; CHECK-LABEL: {{^}}insert_wo_offset:
; CHECK: s_load_dword [[IN:s[0-9]+]]
; CHECK: s_mov_b32 m0, [[IN]]
; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
; Kernel under test: inserts the scalar 5.0 into a constant <4 x float> at the
; position given by the kernel argument %in, then stores the whole vector to
; the global (addrspace(1)) pointer %out.
define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
  ; The insert index is the kernel argument %in rather than a constant.
  %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
  ; Store the full 4-element result vector through %out.
  store <4 x float> %ins, <4 x float> addrspace(1)* %out
  ret void
}

; Make sure we don't hit use of undefined register errors when expanding an
; extract with undef index.

; CHECK-LABEL: {{^}}extract_adjacent_blocks:
; CHECK: s_load_dword [[ARG:s[0-9]+]]
; CHECK: s_cmp_lg_u32
; CHECK: s_cbranch_scc1 [[BB4:BB[0-9]+_[0-9]+]]

; CHECK: buffer_load_dwordx4
; CHECK: v_cndmask_b32_e64
; CHECK: v_cndmask_b32_e64
; CHECK: v_cndmask_b32_e64

; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]

; CHECK: [[BB4]]:
; CHECK: buffer_load_dwordx4
; CHECK: v_cndmask_b32_e64
; CHECK: v_cndmask_b32_e64
; CHECK: v_cndmask_b32_e64

; CHECK: [[ENDBB]]:
; CHECK: buffer_store_dword
; CHECK: s_endpgm

; Kernel under test: two sibling blocks (%bb1 and %bb4) each perform a volatile
; <4 x float> load from an undef address followed by an extractelement with an
; undef index; %bb7 merges the two extracted floats with a phi and stores the
; result through an undef pointer.
; NOTE(review): attribute group #0 is referenced here but no definition of it
; is visible in this part of the file -- confirm it is declared elsewhere.
define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
bb:
  ; Go to %bb1 when %arg is zero, otherwise to %bb4.
  %tmp = icmp eq i32 %arg, 0
  br i1 %tmp, label %bb1, label %bb4

bb1:
  ; Volatile load from an undef address, then an extract with an undef index.
  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Keeps this block from being optimized out
  br label %bb7

bb4:
  ; Same instruction sequence as %bb1, on separately loaded data.
  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Keeps this block from being optimized out
  br label %bb7

bb7:
  ; Select the extracted element from whichever predecessor ran, then store it.
  %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
  store volatile float %tmp8, float addrspace(1)* undef
  ret void
}
AMDGPU: Cleanup subtarget features Try to avoid mutually exclusive features. Don't use a real default GPU, and use a fake "generic". The goal is to make it easier to see which set of features are incompatible between feature strings. Most of the test changes are due to random scheduling changes from not having a default fullspeed model. llvm-svn: 310258 2017-08-07 22:58:04 +08:00			`; RUN: llc -O0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s`
Replace subregister uses when processing tied operands This was for some reason skipping operands that are subregisters instead of keeping the same subregister index. v_movreld_b32 expects src0 to be the subregister of the tied super register use/def. e.g. v_movreld_b32 v0, v9, <imp-def, tied3> v[0:3], <imp-use, tied2> v[0:3] was being replaced with v[4:7] = copy v[0:3] v_movreld_b32 v0, v9, <imp-def, tied3> v[4:7], <imp-use, tied2> v[4:7], which really writes to v[0:3] llvm-svn: 279804 2016-08-26 14:31:32 +08:00
			`; FIXME: Merge into indirect-addressing-si.ll`

			`; Make sure that TwoAddressInstructions keeps src0 as subregister sub0`
			`; of the tied implicit use and def of the super register.`

			`; CHECK-LABEL: {{^}}insert_wo_offset:`
			`; CHECK: s_load_dword [[IN:s[0-9]+]]`
			`; CHECK: s_mov_b32 m0, [[IN]]`
			`; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]`
			`; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {`
Replace subregister uses when processing tied operands This was for some reason skipping operands that are subregisters instead of keeping the same subregister index. v_movreld_b32 expects src0 to be the subregister of the tied super register use/def. e.g. v_movreld_b32 v0, v9, <imp-def, tied3> v[0:3], <imp-use, tied2> v[0:3] was being replaced with v[4:7] = copy v[0:3] v_movreld_b32 v0, v9, <imp-def, tied3> v[4:7], <imp-use, tied2> v[4:7], which really writes to v[0:3] llvm-svn: 279804 2016-08-26 14:31:32 +08:00			`entry:`
			`%ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in`
			`store <4 x float> %ins, <4 x float> addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU Tests: Change a case to be run with -O0 D40231 requires to run case with -O0 to prevent InstructionSimplify from transforming an extractelement with undef index. llvm-svn: 319907 2017-12-07 01:40:09 +08:00			`; Make sure we don't hit use of undefined register errors when expanding an`
			`; extract with undef index.`

			`; CHECK-LABEL: {{^}}extract_adjacent_blocks:`
			`; CHECK: s_load_dword [[ARG:s[0-9]+]]`
			`; CHECK: s_cmp_lg_u32`
			`; CHECK: s_cbranch_scc1 [[BB4:BB[0-9]+_[0-9]+]]`

			`; CHECK: buffer_load_dwordx4`
[AMDGPU] combine extractelement into several selects An extractelement with non-constant index will be lowered either to scratch or movrel loop in most cases. This patch converts such instruction into a set of selects if vector size is not too big. Differential Revision: https://reviews.llvm.org/D54351 llvm-svn: 346800 2018-11-14 05:18:21 +08:00			`; CHECK: v_cndmask_b32_e64`
			`; CHECK: v_cndmask_b32_e64`
			`; CHECK: v_cndmask_b32_e64`
AMDGPU Tests: Change a case to be run with -O0 D40231 requires to run case with -O0 to prevent InstructionSimplify from transforming an extractelement with undef index. llvm-svn: 319907 2017-12-07 01:40:09 +08:00
			`; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]`

			`; CHECK: [[BB4]]:`
			`; CHECK: buffer_load_dwordx4`
[AMDGPU] combine extractelement into several selects An extractelement with non-constant index will be lowered either to scratch or movrel loop in most cases. This patch converts such instruction into a set of selects if vector size is not too big. Differential Revision: https://reviews.llvm.org/D54351 llvm-svn: 346800 2018-11-14 05:18:21 +08:00			`; CHECK: v_cndmask_b32_e64`
			`; CHECK: v_cndmask_b32_e64`
			`; CHECK: v_cndmask_b32_e64`
AMDGPU Tests: Change a case to be run with -O0 D40231 requires to run case with -O0 to prevent InstructionSimplify from transforming an extractelement with undef index. llvm-svn: 319907 2017-12-07 01:40:09 +08:00
			`; CHECK: [[ENDBB]]:`
			`; CHECK: buffer_store_dword`
			`; CHECK: s_endpgm`

			`define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {`
			`bb:`
			`%tmp = icmp eq i32 %arg, 0`
			`br i1 %tmp, label %bb1, label %bb4`

			`bb1:`
			`%tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef`
			`%tmp3 = extractelement <4 x float> %tmp2, i32 undef`
			`call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out`
			`br label %bb7`

			`bb4:`
			`%tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef`
			`%tmp6 = extractelement <4 x float> %tmp5, i32 undef`
			`call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out`
			`br label %bb7`

			`bb7:`
			`%tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]`
			`store volatile float %tmp8, float addrspace(1)* undef`
			`ret void`
			`}`