llvm-project/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; GCN-LABEL: {{^}}trunc_i64_bitcast_v2i32:
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %bc = bitcast <2 x i32> %ld to i64
  %trunc = trunc i64 %bc to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}trunc_i96_bitcast_v3i32:
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
  %bc = bitcast <3 x i32> %ld to i96
  %trunc = trunc i96 %bc to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}trunc_i128_bitcast_v4i32:
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %bc = bitcast <4 x i32> %ld to i128
  %trunc = trunc i128 %bc to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; Don't want load width reduced in this case.
; GCN-LABEL: {{^}}trunc_i16_bitcast_v2i16:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: buffer_store_short [[VAL]]
define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %bc = bitcast <2 x i16> %ld to i32
  %trunc = trunc i32 %bc to i16
  store i16 %trunc, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}trunc_i16_bitcast_v4i16:
; FIXME We need to teach the dagcombiner to reduce load width for:
;   t21: v2i32,ch = load<LD8[%in(addrspace=1)]> t12, t10, undef:i64
;        t23: i64 = bitcast t21
;      t30: i16 = truncate t23
; SI: buffer_load_dword v[[VAL:[0-9]+]]
; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
; GCN: buffer_store_short v[[VAL]], off
define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %bc = bitcast <4 x i16> %ld to i64
  %trunc = trunc i64 %bc to i16
  store i16 %trunc, i16 addrspace(1)* %out
  ret void
}

; FIXME: Consistently shrink or not here
; GCN-LABEL: {{^}}trunc_i8_bitcast_v2i8:
; SI: buffer_load_ubyte [[VAL:v[0-9]+]]
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; GCN: buffer_store_byte [[VAL]]
define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
  %bc = bitcast <2 x i8> %ld to i16
  %trunc = trunc i16 %bc to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}trunc_i32_bitcast_v4i8:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: buffer_store_byte [[VAL]]
define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
  %bc = bitcast <4 x i8> %ld to i32
  %trunc = trunc i32 %bc to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}trunc_i24_bitcast_v3i8:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: buffer_store_byte [[VAL]]
define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
  %bc = bitcast <3 x i8> %ld to i24
  %trunc = trunc i24 %bc to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}
AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI %s`
			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI %s`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00
AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i64_bitcast_v2i32:`
			`; GCN: buffer_load_dword v`
			`; GCN: buffer_store_dword v`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in`
			`%bc = bitcast <2 x i32> %ld to i64`
			`%trunc = trunc i64 %bc to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i96_bitcast_v3i32:`
			`; GCN: buffer_load_dword v`
			`; GCN: buffer_store_dword v`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <3 x i32>, <3 x i32> addrspace(1)* %in`
			`%bc = bitcast <3 x i32> %ld to i96`
			`%trunc = trunc i96 %bc to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i128_bitcast_v4i32:`
			`; GCN: buffer_load_dword v`
			`; GCN: buffer_store_dword v`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in`
			`%bc = bitcast <4 x i32> %ld to i128`
			`%trunc = trunc i128 %bc to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`

			`; Don't want load width reduced in this case.`
AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i16_bitcast_v2i16:`
			`; GCN: buffer_load_dword [[VAL:v[0-9]+]]`
			`; GCN: buffer_store_short [[VAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <2 x i16>, <2 x i16> addrspace(1)* %in`
			`%bc = bitcast <2 x i16> %ld to i32`
			`%trunc = trunc i32 %bc to i16`
			`store i16 %trunc, i16 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i16_bitcast_v4i16:`
AMDGPU: Add VI i16 support Patch By: Wei Ding Differential Revision: https://reviews.llvm.org/D18049 llvm-svn: 286464 2016-11-11 00:02:37 +08:00			`; FIXME We need to teach the dagcombiner to reduce load width for:`
			`; t21: v2i32,ch = load<LD8[%in(addrspace=1)]> t12, t10, undef:i64`
			`; t23: i64 = bitcast t21`
			`; t30: i16 = truncate t23`
			`; SI: buffer_load_dword v[[VAL:[0-9]+]]`
			`; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]`
AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN: buffer_store_short v[[VAL]], off`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in`
			`%bc = bitcast <4 x i16> %ld to i64`
			`%trunc = trunc i64 %bc to i16`
			`store i16 %trunc, i16 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; FIXME: Consistently shrink or not here`
			`; GCN-LABEL: {{^}}trunc_i8_bitcast_v2i8:`
			`; SI: buffer_load_ubyte [[VAL:v[0-9]+]]`
			`; VI: buffer_load_ushort [[VAL:v[0-9]+]]`
			`; GCN: buffer_store_byte [[VAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <2 x i8>, <2 x i8> addrspace(1)* %in`
			`%bc = bitcast <2 x i8> %ld to i16`
			`%trunc = trunc i16 %bc to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i32_bitcast_v4i8:`
			`; GCN: buffer_load_dword [[VAL:v[0-9]+]]`
			`; GCN: buffer_store_byte [[VAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <4 x i8>, <4 x i8> addrspace(1)* %in`
			`%bc = bitcast <4 x i8> %ld to i32`
			`%trunc = trunc i32 %bc to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

AMDGPU: Fix broken check lines in test llvm-svn: 331796 2018-05-09 02:43:44 +08:00			`; GCN-LABEL: {{^}}trunc_i24_bitcast_v3i8:`
			`; GCN: buffer_load_dword [[VAL:v[0-9]+]]`
			`; GCN: buffer_store_byte [[VAL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {`
DAGCombiner: Turn truncate of a bitcasted vector to an extract On AMDGPU where operations i64 operations are often bitcasted to v2i32 and back, this pattern shows up regularly where it breaks some expected combines on i64, such as load width reducing. This fixes some test failures in a future commit when i64 loads are changed to promote. llvm-svn: 262397 2016-03-02 05:31:53 +08:00			`%ld = load <3 x i8>, <3 x i8> addrspace(1)* %in`
			`%bc = bitcast <3 x i8> %ld to i24`
			`%trunc = trunc i24 %bc to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`