llvm-project/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; The bitcast should be pushed through the bitcasts so the vectors can
; be broken down and the shared components can be CSEd

; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
  %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out

  %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out

  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
  store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out

  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
  store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
  %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out

  %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
; GCN-NOT: store_dword
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
  %bc = bitcast i64 %undef to <2 x i32>
  store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
; GCN-NOT: store_dword
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
  %bc = bitcast i64 %undef to <2 x i32>
  %elt1 = extractelement <2 x i32> %bc, i32 1
  store volatile i32 %elt1, i32 addrspace(1)* %out
  ret void
}

declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone convergent }
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00			`; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s`
Enable FeatureFlatForGlobal on Volcanic Islands This switches to the workaround that HSA defaults to for the mesa path. This should be applied to the 4.0 branch. Patch by Vedran Miletić <vedran@miletic.net> llvm-svn: 292982 2017-01-25 06:02:15 +08:00			`; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s`
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00
			`; The bitcast should be pushed through the bitcasts so the vectors can`
			`; be broken down and the shared components can be CSEd`

			`; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:`
			`; GCN: buffer_store_dwordx4`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {`
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00			`%vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>`
			`store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out`

			`%vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>`
			`store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:`
			`; GCN: buffer_store_dwordx4`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {`
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00			`%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>`
			`store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out`

			`%vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>`
			`store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:`
			`; GCN: buffer_store_dwordx4`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {`
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00			`%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>`
			`store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out`

			`%vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>`
			`store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:`
			`; GCN: buffer_store_dwordx4`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
			`; GCN-NOT: v_mov_b32`
			`; GCN: buffer_store_dwordx4`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {`
AMDGPU: Push bitcasts through build_vector This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll llvm-svn: 281822 2016-09-17 23:44:16 +08:00			`%vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>`
			`store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out`

			`%vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>`
			`store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out`
			`ret void`
			`}`
DAG: Fold bitcast/extract_vector_elt of undef to undef Fixes not eliminating store when intrinsic is lowered to undef. llvm-svn: 298385 2017-03-22 00:20:16 +08:00
			`; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:`
			`; GCN-NOT: store_dword`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {`
DAG: Fold bitcast/extract_vector_elt of undef to undef Fixes not eliminating store when intrinsic is lowered to undef. llvm-svn: 298385 2017-03-22 00:20:16 +08:00			`%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1`
			`%bc = bitcast i64 %undef to <2 x i32>`
			`store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:`
			`; GCN-NOT: store_dword`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {`
DAG: Fold bitcast/extract_vector_elt of undef to undef Fixes not eliminating store when intrinsic is lowered to undef. llvm-svn: 298385 2017-03-22 00:20:16 +08:00			`%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1`
			`%bc = bitcast i64 %undef to <2 x i32>`
			`%elt1 = extractelement <2 x i32> %bc, i32 1`
			`store volatile i32 %elt1, i32 addrspace(1)* %out`
			`ret void`
			`}`

			`declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1`

			`attributes #0 = { nounwind }`
			`attributes #1 = { nounwind readnone convergent }`