llvm-project/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll

; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=GFX10HSA -check-prefix=ALL %s

; FIXME: align on alloca seems to be ignored for private_segment_alignment

; ALL-LABEL: {{^}}large_alloca_compute_shader:

; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000


; GCNHSA: .amd_kernel_code_t

; GCNHSA: enable_sgpr_private_segment_wave_byte_offset = 1
; GCNHSA: user_sgpr_count = 8
; GCNHSA: enable_sgpr_workgroup_id_x = 1
; GCNHSA: enable_sgpr_workgroup_id_y = 0
; GCNHSA: enable_sgpr_workgroup_id_z = 0
; GCNHSA: enable_sgpr_workgroup_info = 0
; GCNHSA: enable_vgpr_workitem_id = 0

; GCNHSA: enable_sgpr_private_segment_buffer = 1
; GCNHSA: enable_sgpr_dispatch_ptr = 0
; GCNHSA: enable_sgpr_queue_ptr = 0
; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
; GCNHSA: enable_sgpr_dispatch_id = 0
; GCNHSA: enable_sgpr_flat_scratch_init = 1
; GCNHSA: enable_sgpr_private_segment_size = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0
; GCNHSA: workitem_private_segment_byte_size = 32772
; GCNHSA: private_segment_alignment = 4
; GCNHSA: .end_amd_kernel_code_t

; GFX10HSA: s_add_u32 [[FLAT_SCR_LO:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX10HSA-DAG: s_addc_u32 [[FLAT_SCR_HI:s[0-9]+]], s{{[0-9]+}}, 0
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]

; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen

; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
  %large = alloca [8192 x i32], align 4, addrspace(5)
  %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191
  store volatile i32 %x, i32 addrspace(5)* %gep
  %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y
  %val = load volatile i32, i32 addrspace(5)* %gep1
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

attributes #0 = { nounwind  }
AMDGPU/SI: Don't use fixup_si_rodata for scratch rsrc relocations Summary: We need to set the fixup type to FK_Data_4 for the SCRATCH_RSRC_DWORD[01] symbols, since these require absolute relocations, and fixup_si_rodata is for relative relocations. Reviewers: arsenm, kzhuravl Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: http://reviews.llvm.org/D21153 llvm-svn: 272417 2016-06-11 03:26:38 +08:00			`; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s \| FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s`
			`; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s \| FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s`
AMDGPU: Buffer descriptor changes for GFX9 Reviewers: arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, dstuttard, tpr Differential Revision: https://reviews.llvm.org/D31158 llvm-svn: 298397 2017-03-22 01:00:39 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s \| FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s`
AMDGPU: Enable code object v3 for AMDHSA only Differential Revision: https://reviews.llvm.org/D54186 llvm-svn: 346923 2018-11-15 10:32:43 +08:00			`; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global \| FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s`
			`; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s \| FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s`
[AMDGPU] gfx1010 tests. NFC. llvm-svn: 360615 2019-05-14 03:30:06 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s \| FileCheck -check-prefix=GCNHSA -check-prefix=GFX10HSA -check-prefix=ALL %s`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00
			`; FIXME: align on alloca seems to be ignored for private_segment_alignment`

			`; ALL-LABEL: {{^}}large_alloca_compute_shader:`

AMDGPU: Change private_element_size to 4 llvm-svn: 269145 2016-05-11 08:28:54 +08:00			`; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0`
AMDGPU: Emit R_AMDGPU_ABS32_{HI,LO} for scratch buffer relocations Reviewers: arsenm, rafael, kzhuravl Subscribers: rafael, arsenm, llvm-commits, kzhuravl Differential Revision: http://reviews.llvm.org/D21400 llvm-svn: 273166 2016-06-21 00:59:44 +08:00			`; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD0`
AMDGPU: Change private_element_size to 4 llvm-svn: 269145 2016-05-11 08:28:54 +08:00			`; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1`
AMDGPU: Emit R_AMDGPU_ABS32_{HI,LO} for scratch buffer relocations Reviewers: arsenm, rafael, kzhuravl Subscribers: rafael, arsenm, llvm-commits, kzhuravl Differential Revision: http://reviews.llvm.org/D21400 llvm-svn: 273166 2016-06-21 00:59:44 +08:00			`; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD1`
AMDGPU: Change private_element_size to 4 llvm-svn: 269145 2016-05-11 08:28:54 +08:00			`; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1`
AMDGPU/SI: Set INDEX_STRIDE for scratch coalescing Summary: Mesa and other users must set this to enable coalescing: - STRIDE = 0 - SWIZZLE_ENABLE = 1 This makes one particular compute shader 8x faster. Reviewers: tstellarAMD, arsenm Subscribers: arsenm, kzhuravl Differential Revision: http://reviews.llvm.org/D21136 llvm-svn: 272556 2016-06-14 00:05:57 +08:00			`; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000`
			`; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000`
AMDGPU: Buffer descriptor changes for GFX9 Reviewers: arsenm Subscribers: qcolombet, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, dstuttard, tpr Differential Revision: https://reviews.llvm.org/D31158 llvm-svn: 298397 2017-03-22 01:00:39 +08:00			`; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00

			`; GCNHSA: .amd_kernel_code_t`
AMDGPU: Rework how private buffer passed for HSA If we know we have stack objects, we reserve the registers that the private buffer resource and wave offset are passed and use them directly. If not, reserve the last 5 SGPRs just in case we need to spill. After register allocation, try to pick the next available registers instead of the last SGPRs, and then insert copies from the inputs to the reserved registers in the progloue. This also only selectively enables all of the input registers which are really required instead of always enabling them. llvm-svn: 254331 2015-12-01 05:16:03 +08:00
[AMDGPU] Assembler: rename amd_kernel_code_t asm names according to spec Summary: Also removed duplicate code from AMDGPUTargetAsmStreamer. This change only change how amd_kernel_code_t is parsed and printed. No variable names are changed. Reviewers: vpykhtin, tstellarAMD Subscribers: arsenm, wdng, nhaehnle Differential Revision: https://reviews.llvm.org/D24296 llvm-svn: 281028 2016-09-09 18:08:02 +08:00			`; GCNHSA: enable_sgpr_private_segment_wave_byte_offset = 1`
			`; GCNHSA: user_sgpr_count = 8`
			`; GCNHSA: enable_sgpr_workgroup_id_x = 1`
			`; GCNHSA: enable_sgpr_workgroup_id_y = 0`
			`; GCNHSA: enable_sgpr_workgroup_id_z = 0`
			`; GCNHSA: enable_sgpr_workgroup_info = 0`
			`; GCNHSA: enable_vgpr_workitem_id = 0`
AMDGPU: Rework how private buffer passed for HSA If we know we have stack objects, we reserve the registers that the private buffer resource and wave offset are passed and use them directly. If not, reserve the last 5 SGPRs just in case we need to spill. After register allocation, try to pick the next available registers instead of the last SGPRs, and then insert copies from the inputs to the reserved registers in the progloue. This also only selectively enables all of the input registers which are really required instead of always enabling them. llvm-svn: 254331 2015-12-01 05:16:03 +08:00
			`; GCNHSA: enable_sgpr_private_segment_buffer = 1`
			`; GCNHSA: enable_sgpr_dispatch_ptr = 0`
			`; GCNHSA: enable_sgpr_queue_ptr = 0`
			`; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1`
			`; GCNHSA: enable_sgpr_dispatch_id = 0`
AMDGPU: Set flat_scratch from flat_scratch_init reg This was hardcoded to the static private size, but this would be missing the offset and additional size for someday when we have dynamic sizing. Also stops always initializing flat_scratch even when unused. In the future we should stop emitting this unless flat instructions are used to access private memory. For example this will initialize it almost always on VI because flat is used for global access. llvm-svn: 260658 2016-02-12 14:31:30 +08:00			`; GCNHSA: enable_sgpr_flat_scratch_init = 1`
AMDGPU: Rework how private buffer passed for HSA If we know we have stack objects, we reserve the registers that the private buffer resource and wave offset are passed and use them directly. If not, reserve the last 5 SGPRs just in case we need to spill. After register allocation, try to pick the next available registers instead of the last SGPRs, and then insert copies from the inputs to the reserved registers in the progloue. This also only selectively enables all of the input registers which are really required instead of always enabling them. llvm-svn: 254331 2015-12-01 05:16:03 +08:00			`; GCNHSA: enable_sgpr_private_segment_size = 0`
			`; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0`
			`; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0`
			`; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0`
AMDGPU/SI: Set the code objects private segment size when targeting HSA. Summary: I'm not sure how things worked before without this. Reviewers: arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D15492 llvm-svn: 255692 2015-12-16 06:55:30 +08:00			`; GCNHSA: workitem_private_segment_byte_size = 32772`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00			`; GCNHSA: private_segment_alignment = 4`
			`; GCNHSA: .end_amd_kernel_code_t`

[AMDGPU] gfx1010 tests. NFC. llvm-svn: 360615 2019-05-14 03:30:06 +08:00			`; GFX10HSA: s_add_u32 [[FLAT_SCR_LO:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}`
			`; GFX10HSA-DAG: s_addc_u32 [[FLAT_SCR_HI:s[0-9]+]], s{{[0-9]+}}, 0`
			`; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]`
			`; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00
AMDGPU: Set flat_scratch from flat_scratch_init reg This was hardcoded to the static private size, but this would be missing the offset and additional size for someday when we have dynamic sizing. Also stops always initializing flat_scratch even when unused. In the future we should stop emitting this unless flat instructions are used to access private memory. For example this will initialize it almost always on VI because flat is used for global access. llvm-svn: 260658 2016-02-12 14:31:30 +08:00			`; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen`
			`; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00
[AMDGPU] Switch to the new addr space mapping by default This requires corresponding clang change. Differential Revision: https://reviews.llvm.org/D40955 llvm-svn: 324101 2018-02-03 00:07:16 +08:00			`; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00			`; ALL: ; ScratchSize: 32772`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {`
[AMDGPU] Switch to the new addr space mapping by default This requires corresponding clang change. Differential Revision: https://reviews.llvm.org/D40955 llvm-svn: 324101 2018-02-03 00:07:16 +08:00			`%large = alloca [8192 x i32], align 4, addrspace(5)`
			`%gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191`
			`store volatile i32 %x, i32 addrspace(5)* %gep`
			`%gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y`
			`%val = load volatile i32, i32 addrspace(5)* %gep1`
AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. llvm-svn: 254329 2015-12-01 05:15:53 +08:00			`store volatile i32 %val, i32 addrspace(1)* undef`
			`ret void`
			`}`

			`attributes #0 = { nounwind }`