llvm-project/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=MUBUF
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefix=FLATSCR

; Make sure there's no assertion from passing a 0 alignment value
;
; Regression test: lowering this fixed-size memcpy must not trip an assertion
; in the backend. The function copies 40 bytes from %src (addrspace(1)) into a
; function-local 40-byte alloca in addrspace(5); both pointers are annotated
; align 4. The %dst argument is intentionally left unused by the body.
;
; The check lines below are autogenerated and pin the exact expansion for each
; llc configuration at the top of the file:
;   - MUBUF prefix (default): the 40 bytes are read with global_load_dword /
;     global_load_dwordx4 and written back as ten buffer_store_dword.
;   - FLATSCR prefix (-amdgpu-enable-flat-scratch): the 40 bytes are read with
;     global_load_dwordx2 / global_load_dwordx4 and written with
;     scratch_store_dwordx2 / scratch_store_dwordx4.
; Regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_fixed_align(i8 addrspace(5)*  %dst, i8 addrspace(1)* %src) {
; MUBUF-LABEL: memcpy_fixed_align:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    global_load_dword v0, v[1:2], off offset:36
; MUBUF-NEXT:    global_load_dword v11, v[1:2], off offset:32
; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
; MUBUF-NEXT:    s_waitcnt vmcnt(3)
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:36
; MUBUF-NEXT:    s_waitcnt vmcnt(3)
; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32
; MUBUF-NEXT:    s_waitcnt vmcnt(3)
; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28
; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24
; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20
; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    s_waitcnt vmcnt(6)
; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: memcpy_fixed_align:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
; FLATSCR-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
; FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[11:12], s32 offset:32
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[3:6], s32 offset:16
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[7:10], s32
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
  ; 40-byte buffer in addrspace(5); this is the destination of the copy.
  %alloca = alloca [40 x i8], addrspace(5)
  ; View the array as a plain i8 pointer, as the memcpy intrinsic signature expects.
  %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Constant-length (40 bytes) copy from %src into the alloca; both operands are
  ; align 4 dereferenceable(40), and the trailing i1 false is the intrinsic's
  ; volatile flag.
  call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false)
  ret void
}

; Declaration of the memcpy intrinsic overload used by the test: destination is
; an i8 pointer in addrspace(5), source is an i8 pointer in addrspace(1), the
; length is an i64, and the final i1 must be an immediate (immarg).
declare void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* noalias nocapture writeonly, i8 addrspace(1)* noalias nocapture readonly, i64, i1 immarg) #0

; Attribute group #0, attached to the intrinsic declaration above.
attributes #0 = { argmemonly nounwind willreturn }
DAG: Don't pass 0 alignment value to allowsMisalignedMemoryAccesses I think not unconditionally passing getDstAlign is broken, but leave that for another change. 2020-08-13 08:02:05 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s \| FileCheck %s -check-prefix=MUBUF`
			`; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-flat-scratch < %s \| FileCheck %s -check-prefix=FLATSCR`
DAG: Don't pass 0 alignment value to allowsMisalignedMemoryAccesses I think not unconditionally passing getDstAlign is broken, but leave that for another change. 2020-08-13 08:02:05 +08:00
			`; Make sure there's no assertion from passing a 0 alignment value`
			`define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) {`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; MUBUF-LABEL: memcpy_fixed_align:`
			`; MUBUF: ; %bb.0:`
			`; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)`
			`; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36`
[amdgpu] Enable use of AA during codegen. - Add an internal option `-amdgpu-use-aa-in-codegen` to enable or disable this feature. By Default, it's enabled. Differential Revision: https://reviews.llvm.org/D89320 2020-10-12 11:51:53 +08:00			`; MUBUF-NEXT: global_load_dword v11, v[1:2], off offset:32`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16`
[amdgpu] Enable use of AA during codegen. - Add an internal option `-amdgpu-use-aa-in-codegen` to enable or disable this feature. By Default, it's enabled. Differential Revision: https://reviews.llvm.org/D89320 2020-10-12 11:51:53 +08:00			`; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off`
			`; MUBUF-NEXT: s_waitcnt vmcnt(3)`
			`; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36`
			`; MUBUF-NEXT: s_waitcnt vmcnt(3)`
			`; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32`
			`; MUBUF-NEXT: s_waitcnt vmcnt(3)`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28`
			`; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24`
			`; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20`
			`; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16`
[amdgpu] Enable use of AA during codegen. - Add an internal option `-amdgpu-use-aa-in-codegen` to enable or disable this feature. By Default, it's enabled. Differential Revision: https://reviews.llvm.org/D89320 2020-10-12 11:51:53 +08:00			`; MUBUF-NEXT: s_waitcnt vmcnt(6)`
			`; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12`
			`; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8`
			`; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4`
			`; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; MUBUF-NEXT: s_waitcnt vmcnt(0)`
			`; MUBUF-NEXT: s_setpc_b64 s[30:31]`
			`;`
			`; FLATSCR-LABEL: memcpy_fixed_align:`
			`; FLATSCR: ; %bb.0:`
			`; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)`
[AMDGPU] Enable multi-dword flat scratch load/stores Differential Revision: https://reviews.llvm.org/D91384 2020-11-13 05:04:33 +08:00			`; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16`
[amdgpu] Enable use of AA during codegen. - Add an internal option `-amdgpu-use-aa-in-codegen` to enable or disable this feature. By Default, it's enabled. Differential Revision: https://reviews.llvm.org/D89320 2020-10-12 11:51:53 +08:00			`; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off`
[AMDGPU] Enable multi-dword flat scratch load/stores Differential Revision: https://reviews.llvm.org/D91384 2020-11-13 05:04:33 +08:00			`; FLATSCR-NEXT: s_waitcnt vmcnt(2)`
			`; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32`
			`; FLATSCR-NEXT: s_waitcnt vmcnt(2)`
			`; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16`
			`; FLATSCR-NEXT: s_waitcnt vmcnt(2)`
			`; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32`
[AMDGPU] Use flat scratch instructions where available The support is disabled by default. So far there is instruction selection, spilling, and frame elimination. It also changes SP from unswizzled to swizzled as used by flat scratch instructions, so it cannot be mixed with MUBUF stack access. At the very least missing: - GlobalISel; - Some optimizations in frame elimination in between vector and scalar ALU; - It shall finally allow to always materialize frame index as an SGPR, but that is not implemented and frame elimination cannot handle it yet; - Unaligned and/or multidword flat scratch shall work, but it is legalized now for MUBUF; - Operand folding cannot optimize FI like with MUBUF yet; - It will need scaling the value of the SP/FP in the DWARF expression to recover the unswizzled scratch address; Differential Revision: https://reviews.llvm.org/D89170 2020-10-22 05:27:03 +08:00			`; FLATSCR-NEXT: s_waitcnt vmcnt(0)`
			`; FLATSCR-NEXT: s_setpc_b64 s[30:31]`
DAG: Don't pass 0 alignment value to allowsMisalignedMemoryAccesses I think not unconditionally passing getDstAlign is broken, but leave that for another change. 2020-08-13 08:02:05 +08:00			`%alloca = alloca [40 x i8], addrspace(5)`
			`%cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)*`
			`call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false)`
			`ret void`
			`}`

			`declare void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* noalias nocapture writeonly, i8 addrspace(1)* noalias nocapture readonly, i64, i1 immarg) #0`

			`attributes #0 = { argmemonly nounwind willreturn }`