; Source: llvm-project/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; This testcase would fail on GFX908 due to not having a free VGPR available to
; copy between AGPRs.
define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v32, v1
; GFX908-NEXT: v_mov_b32_e32 v33, v0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] a[0:15]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v34, a15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a14
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a30, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a13
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a29, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a27, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a10
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a26, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a9
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a24, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a23, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a6
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a5
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a21, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a20, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a18, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a17, v34
; GFX908-NEXT: v_accvgpr_read_b32 v34, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v34
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v34, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a2 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a3 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a4 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a5 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a6 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a7 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a8 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a9 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a10 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v34, a1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v34
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a1, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a2, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a3, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a4, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a5, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a6, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a7, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a8, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a9, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a10, v34 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v33, v0
; GFX90A-NEXT: v_mov_b32_e32 v32, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31] a[0:15]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a15
; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a14
; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a13
; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a12
; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a11
; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a10
; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a9
; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a8
; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a7
; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a6
; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a5
; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a4
; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a3
; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a2
; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_read_b32 v39, a10 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1
; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; IR under test. The notes below describe only what the IR and the expected
; output above show.
;
; One inline asm defines all of v[0:31] and a[0:15] at once.
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
; MFMA with %agpr0 as the accumulator input. In the expected output the
; asm-defined a[0:15] are first copied to a[16:31] (a v_accvgpr_read/write
; pair per register through v34 on gfx908, a single v_accvgpr_mov_b32 per
; register on gfx90a) and the MFMA result is produced in a[0:15].
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
; Produces a value in a1 and clobbers a[0:15]. The expected output saves the
; MFMA result before this asm and restores it afterwards, using stack slots
; plus VGPRs above v33.
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
; Consumes %agpr1 in any AGPR ("a") while %mfma is pinned to a[0:15]. Moving
; %agpr1 out of a1 is the AGPR-to-AGPR copy (a1 -> a16 in the expected
; output); on gfx908 it goes through the VGPR temporary v34.
; NOTE(review): in the gfx908 expected output v34 is loaded from a15 just
; before the first "; copy" asm, is then reused for the a1 -> a16 copy and
; for every stack reload, and is finally written to a15. a15 therefore
; appears to receive the value reloaded from offset:40 instead of its saved
; value - confirm whether these checks capture a known miscompile.
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
; Keeps %agpr2 (in a3) and all 32 asm-defined VGPRs live until the end.
call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
ret void
}
; Check that we do make use of v32 if there are no AGPRs present in the function
define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0 {
; GFX908-LABEL: no_agpr_no_reserve:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX908-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX908-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX908-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX908-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX908-NEXT: s_waitcnt vmcnt(7)
; GFX908-NEXT: v_add_u32_e32 v4, v4, v4
; GFX908-NEXT: v_add_u32_e32 v3, v3, v3
; GFX908-NEXT: v_add_u32_e32 v2, v2, v2
; GFX908-NEXT: v_add_u32_e32 v1, v1, v1
; GFX908-NEXT: s_waitcnt vmcnt(6)
; GFX908-NEXT: v_add_u32_e32 v8, v8, v8
; GFX908-NEXT: v_add_u32_e32 v7, v7, v7
; GFX908-NEXT: v_add_u32_e32 v6, v6, v6
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v32, v32, v32
; GFX908-NEXT: v_add_u32_e32 v31, v31, v31
; GFX908-NEXT: v_add_u32_e32 v30, v30, v30
; GFX908-NEXT: v_add_u32_e32 v29, v29, v29
; GFX908-NEXT: v_add_u32_e32 v5, v5, v5
; GFX908-NEXT: v_add_u32_e32 v12, v12, v12
; GFX908-NEXT: v_add_u32_e32 v11, v11, v11
; GFX908-NEXT: v_add_u32_e32 v10, v10, v10
; GFX908-NEXT: v_add_u32_e32 v9, v9, v9
; GFX908-NEXT: v_add_u32_e32 v16, v16, v16
; GFX908-NEXT: v_add_u32_e32 v15, v15, v15
; GFX908-NEXT: v_add_u32_e32 v14, v14, v14
; GFX908-NEXT: v_add_u32_e32 v13, v13, v13
; GFX908-NEXT: v_add_u32_e32 v20, v20, v20
; GFX908-NEXT: v_add_u32_e32 v19, v19, v19
; GFX908-NEXT: v_add_u32_e32 v18, v18, v18
; GFX908-NEXT: v_add_u32_e32 v17, v17, v17
; GFX908-NEXT: v_add_u32_e32 v24, v24, v24
; GFX908-NEXT: v_add_u32_e32 v23, v23, v23
; GFX908-NEXT: v_add_u32_e32 v22, v22, v22
; GFX908-NEXT: v_add_u32_e32 v21, v21, v21
; GFX908-NEXT: v_add_u32_e32 v28, v28, v28
; GFX908-NEXT: v_add_u32_e32 v27, v27, v27
; GFX908-NEXT: v_add_u32_e32 v26, v26, v26
; GFX908-NEXT: v_add_u32_e32 v25, v25, v25
; GFX908-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
; GFX908-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
; GFX908-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
; GFX908-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
; GFX908-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: no_agpr_no_reserve:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
; GFX90A-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
; GFX90A-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
; GFX90A-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_u32_e32 v3, v3, v3
; GFX90A-NEXT: v_add_u32_e32 v2, v2, v2
; GFX90A-NEXT: v_add_u32_e32 v1, v1, v1
; GFX90A-NEXT: v_add_u32_e32 v0, v0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: v_add_u32_e32 v7, v7, v7
; GFX90A-NEXT: v_add_u32_e32 v6, v6, v6
; GFX90A-NEXT: v_add_u32_e32 v5, v5, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v31, v31, v31
; GFX90A-NEXT: v_add_u32_e32 v30, v30, v30
; GFX90A-NEXT: v_add_u32_e32 v29, v29, v29
; GFX90A-NEXT: v_add_u32_e32 v28, v28, v28
; GFX90A-NEXT: v_add_u32_e32 v4, v4, v4
; GFX90A-NEXT: v_add_u32_e32 v11, v11, v11
; GFX90A-NEXT: v_add_u32_e32 v10, v10, v10
; GFX90A-NEXT: v_add_u32_e32 v9, v9, v9
; GFX90A-NEXT: v_add_u32_e32 v8, v8, v8
; GFX90A-NEXT: v_add_u32_e32 v15, v15, v15
; GFX90A-NEXT: v_add_u32_e32 v14, v14, v14
; GFX90A-NEXT: v_add_u32_e32 v13, v13, v13
; GFX90A-NEXT: v_add_u32_e32 v12, v12, v12
; GFX90A-NEXT: v_add_u32_e32 v19, v19, v19
; GFX90A-NEXT: v_add_u32_e32 v18, v18, v18
; GFX90A-NEXT: v_add_u32_e32 v17, v17, v17
; GFX90A-NEXT: v_add_u32_e32 v16, v16, v16
; GFX90A-NEXT: v_add_u32_e32 v23, v23, v23
; GFX90A-NEXT: v_add_u32_e32 v22, v22, v22
; GFX90A-NEXT: v_add_u32_e32 v21, v21, v21
; GFX90A-NEXT: v_add_u32_e32 v20, v20, v20
; GFX90A-NEXT: v_add_u32_e32 v27, v27, v27
; GFX90A-NEXT: v_add_u32_e32 v26, v26, v26
; GFX90A-NEXT: v_add_u32_e32 v25, v25, v25
; GFX90A-NEXT: v_add_u32_e32 v24, v24, v24
; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX90A-NEXT: s_endpgm
;
; IR under test: each work-item picks the <32 x i32> element of %arg at index
; workitem.id.x, doubles every lane (%load + %load) and stores the result back
; to the same address.
; No AGPR appears in this IR or in the expected output. The expected output
; does use v32 on both targets: as the last register of v[29:32] on gfx908 and
; as the byte-offset register (workitem id shifted left by 7) on gfx90a.
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %id
%load = load <32 x i32>, <32 x i32> addrspace(1)* %gep
%add = add <32 x i32> %load, %load
store <32 x i32> %add, <32 x i32> addrspace(1)* %gep
ret void
}
; FIXME: This case is broken. The asm value passed in v32 is live
; through the range where the reserved def for the copy is introduced,
; clobbering the user value.
define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-LABEL: v32_asm_def_use:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v33, v1
; GFX908-NEXT: v_mov_b32_e32 v34, v0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] a[0:15]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v35, a15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v32
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a30, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a13
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a29, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a27, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a10
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a26, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a9
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a24, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a23, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a6
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a5
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a21, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a20, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a17, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v35
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v35, a1
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_write_b32 a32, v35
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: v_accvgpr_read_b32 v33, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use v32
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v32_asm_def_use:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v34, v0
; GFX90A-NEXT: v_mov_b32_e32 v33, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31] a[0:15]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a15
; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a14
; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a13
; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a12
; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a11
; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a10
; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a9
; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a8
; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a7
; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a6
; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a5
; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a4
; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a3
; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a2
; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v32
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v35, a32 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v32
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; IR under test. It has the same asm-def / MFMA / two "; copy" / use sequence
; as @no_free_vgprs_at_agpr_to_agpr_copy, plus one extra value (%v32) that
; inline asm pins to v32 and that stays live across both "; copy" asm blocks
; until the final "; use" asm.
; NOTE(review): in the expected output shown above for both targets, the
; temporaries used for AGPR copies are v35 and v33, and no instruction between
; "; def v32" and "; use v32" writes v32. The FIXME preceding this function
; describes v32 being clobbered, so it may be out of date - confirm.
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
; User value explicitly placed in v32 by the asm output constraint.
%v32 = call i32 asm sideeffect "; def $0","=${v32}"()
; Produces a value in a1 and clobbers a[0:15].
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
; Takes %agpr1 in any AGPR (a32 in the expected output) with %mfma pinned to
; a[0:15]; the a1 -> a32 move is the AGPR-to-AGPR copy.
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
; Final use of the v32-pinned value; it must still hold what "; def v32" wrote.
call void asm sideeffect "; use $0","${v32}"(i32 %v32)
ret void
}
define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18
; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX908-NEXT: s_sub_i32 s4, 0, s1
; GFX908-NEXT: s_lshr_b32 s11, s8, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT: s_lshl_b64 s[8:9], s[2:3], 5
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s11
; GFX908-NEXT: s_or_b32 s8, s8, 28
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX908-NEXT: v_mov_b32_e32 v7, s3
; GFX908-NEXT: s_mov_b32 s10, 0
; GFX908-NEXT: v_mov_b32_e32 v6, s2
; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0
; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5
; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX908-NEXT: v_add_u32_e32 v0, v0, v2
; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX908-NEXT: v_mov_b32_e32 v2, s8
; GFX908-NEXT: v_mov_b32_e32 v3, s9
; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1
; GFX908-NEXT: v_add_u32_e32 v5, 1, v0
; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4
; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX908-NEXT: v_subrev_u32_e32 v5, s1, v4
; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX908-NEXT: v_add_u32_e32 v5, 1, v0
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4
; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s0, v24
; GFX908-NEXT: s_and_b32 s0, 0xffff, s0
; GFX908-NEXT: s_mul_i32 s1, s7, s0
; GFX908-NEXT: s_mul_hi_u32 s7, s6, s0
; GFX908-NEXT: s_mul_i32 s0, s6, s0
; GFX908-NEXT: s_add_i32 s1, s7, s1
; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
; GFX908-NEXT: s_cbranch_scc0 .LBB3_1
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: v_mov_b32_e32 v8, 0
; GFX908-NEXT: v_mov_b32_e32 v9, 0
; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX908-NEXT: s_mov_b32 s11, s10
; GFX908-NEXT: v_mov_b32_e32 v13, s11
; GFX908-NEXT: v_mov_b32_e32 v15, s11
; GFX908-NEXT: v_mov_b32_e32 v17, s11
; GFX908-NEXT: v_mov_b32_e32 v12, s10
; GFX908-NEXT: v_mov_b32_e32 v14, s10
; GFX908-NEXT: v_mov_b32_e32 v16, s10
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7]
; GFX908-NEXT: v_mov_b32_e32 v11, v3
; GFX908-NEXT: v_mov_b32_e32 v19, v13
; GFX908-NEXT: v_mov_b32_e32 v10, v2
; GFX908-NEXT: v_mov_b32_e32 v18, v12
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s2, v8
; GFX908-NEXT: v_readfirstlane_b32 s3, v9
; GFX908-NEXT: s_add_u32 s2, s2, 1
; GFX908-NEXT: s_addc_u32 s3, s3, 0
; GFX908-NEXT: s_mul_hi_u32 s9, s4, s2
; GFX908-NEXT: s_mul_i32 s11, s5, s2
; GFX908-NEXT: s_mul_i32 s8, s4, s2
; GFX908-NEXT: s_mul_i32 s2, s4, s3
; GFX908-NEXT: s_add_i32 s2, s9, s2
; GFX908-NEXT: s_add_i32 s9, s2, s11
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
; GFX908-NEXT: v_mov_b32_e32 v20, s7
; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s6, v10
; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_mov_b32_e32 v21, s9
; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s8, v10
; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc
; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_load_dword v27, v[20:21], off offset:-8 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_load_dword v22, v[20:21], off offset:-4 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_load_dword v20, v[20:21], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: ds_read_b64 v[20:21], v1
; GFX908-NEXT: ds_read_b64 v[22:23], v0
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cbranch_vccnz .LBB3_4
; GFX908-NEXT: ; %bb.6: ; %bb51
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT: v_cvt_f32_f16_e32 v28, v28
; GFX908-NEXT: v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT: v_cvt_f32_f16_e32 v27, v27
; GFX908-NEXT: v_add_f32_e32 v31, v25, v20
; GFX908-NEXT: v_add_f32_e32 v32, v26, v21
; GFX908-NEXT: v_add_f32_e32 v33, 0, v20
; GFX908-NEXT: v_add_f32_e32 v34, 0, v21
; GFX908-NEXT: v_add_f32_e32 v23, v29, v23
; GFX908-NEXT: v_add_f32_e32 v22, v28, v22
; GFX908-NEXT: v_add_f32_e32 v21, v30, v21
; GFX908-NEXT: v_add_f32_e32 v20, v27, v20
; GFX908-NEXT: v_add_f32_e32 v13, v13, v32
; GFX908-NEXT: v_add_f32_e32 v12, v12, v31
; GFX908-NEXT: v_add_f32_e32 v15, v15, v34
; GFX908-NEXT: v_add_f32_e32 v14, v14, v33
; GFX908-NEXT: v_add_f32_e32 v16, v16, v22
; GFX908-NEXT: v_add_f32_e32 v17, v17, v23
; GFX908-NEXT: v_add_f32_e32 v18, v18, v20
; GFX908-NEXT: v_add_f32_e32 v19, v19, v21
; GFX908-NEXT: s_branch .LBB3_4
;
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x18
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_mov_b32 s8, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s9, 0, s7
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[10:11], 5
; GFX90A-NEXT: s_or_b32 s4, s4, 28
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT: s_lshr_b32 s12, s2, 16
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12
; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: v_mul_lo_u32 v8, s9, v0
; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8
; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8
; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s7
; GFX90A-NEXT: v_sub_u32_e32 v8, s6, v8
; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX90A-NEXT: v_subrev_u32_e32 v9, s7, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s4, v28
; GFX90A-NEXT: s_and_b32 s4, 0xffff, s4
; GFX90A-NEXT: s_mul_i32 s1, s1, s4
; GFX90A-NEXT: s_mul_hi_u32 s5, s0, s4
; GFX90A-NEXT: s_mul_i32 s0, s0, s4
; GFX90A-NEXT: s_add_i32 s1, s5, s1
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5
; GFX90A-NEXT: s_branch .LBB3_2
; GFX90A-NEXT: .LBB3_1: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v9, vcc
; GFX90A-NEXT: .LBB3_2: ; %bb9
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off
; GFX90A-NEXT: s_mov_b32 s9, s8
; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s6, v12
; GFX90A-NEXT: v_readfirstlane_b32 s7, v13
; GFX90A-NEXT: s_add_u32 s6, s6, 1
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s6
; GFX90A-NEXT: s_mul_i32 s7, s2, s7
; GFX90A-NEXT: s_mul_i32 s10, s3, s6
; GFX90A-NEXT: s_add_i32 s7, s9, s7
; GFX90A-NEXT: s_mul_i32 s6, s2, s6
; GFX90A-NEXT: s_add_i32 s7, s7, s10
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
; GFX90A-NEXT: v_mov_b32_e32 v24, s5
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s4, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_mov_b32_e32 v25, s7
; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s6, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc
; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v29, v[24:25], off offset:-8 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v26, v[24:25], off offset:-4 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v26, v[24:25], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ; kill: killed $vgpr24 killed $vgpr25
; GFX90A-NEXT: ds_read_b64 v[24:25], v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ds_read_b64 v[26:27], v0
; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4
; GFX90A-NEXT: ; %bb.6: ; %bb51
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_cvt_f32_f16_e32 v30, v30
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_cvt_f32_f16_e32 v32, v29
; GFX90A-NEXT: v_pk_add_f32 v[34:35], v[2:3], v[24:25]
; GFX90A-NEXT: v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0]
; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[30:31], v[26:27]
; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[32:33], v[24:25]
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35]
; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[36:37]
; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[26:27]
; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[24:25]
; GFX90A-NEXT: s_branch .LBB3_4
bb:
%i = load volatile i16, i16 addrspace(4)* undef, align 2
%i6 = zext i16 %i to i64
%i7 = udiv i32 %arg1, %arg2
%i8 = zext i32 %i7 to i64
br label %bb9
bb9: ; preds = %bb12, %bb
%i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
%i11 = icmp slt i64 %i10, 0
br i1 undef, label %bb14, label %bb12
bb12: ; preds = %bb58, %bb9
%i13 = add nuw nsw i64 %i10, %i8
br label %bb9
bb14: ; preds = %bb9
%i15 = load i64, i64 addrspace(1)* null, align 8
br label %bb16
bb16: ; preds = %bb58, %bb14
%i17 = phi i64 [ %i65, %bb58 ], [ %i15, %bb14 ]
%i18 = phi <2 x float> [ %i59, %bb58 ], [ zeroinitializer, %bb14 ]
%i19 = phi <2 x float> [ %i60, %bb58 ], [ zeroinitializer, %bb14 ]
%i20 = phi <2 x float> [ %i61, %bb58 ], [ zeroinitializer, %bb14 ]
%i21 = phi <2 x float> [ %i62, %bb58 ], [ zeroinitializer, %bb14 ]
%i22 = add nsw i64 %i17, 1
%i23 = mul nsw i64 %i22, %arg
%i24 = add nsw i64 %i23, %i10
%i25 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 8
%i26 = bitcast half addrspace(1)* %i25 to <2 x half> addrspace(1)*
%i27 = load volatile <2 x half>, <2 x half> addrspace(1)* %i26, align 16
%i28 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 10
%i29 = bitcast half addrspace(1)* %i28 to <2 x half> addrspace(1)*
%i30 = load volatile <2 x half>, <2 x half> addrspace(1)* %i29, align 4
%i31 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 12
%i32 = bitcast half addrspace(1)* %i31 to <2 x half> addrspace(1)*
%i33 = load volatile <2 x half>, <2 x half> addrspace(1)* %i32, align 8
%i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
%i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
%i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
%i37 = fpext <2 x half> %arg4 to <2 x float>
%i39 = fpext <2 x half> %i27 to <2 x float>
%i40 = fpext <2 x half> %i30 to <2 x float>
%i41 = fpext <2 x half> %i33 to <2 x float>
%i42 = fpext <2 x half> %i36 to <2 x float>
%i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i44 = fadd contract <2 x float> %i37, %i43
%i45 = fadd contract <2 x float> %i43, zeroinitializer
%i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
%i47 = fadd contract <2 x float> %i39, %i46
%i48 = fadd contract <2 x float> %i40, %i43
%i49 = fadd contract <2 x float> %i41, zeroinitializer
%i50 = fadd contract <2 x float> %i42, zeroinitializer
fence syncscope("workgroup") acquire
br i1 %i11, label %bb58, label %bb51
bb51: ; preds = %bb16
%i52 = fadd contract <2 x float> %i18, %i44
%i53 = fadd contract <2 x float> %i19, %i45
%i54 = fadd contract <2 x float> %i20, %i47
%i55 = fadd contract <2 x float> %i21, %i48
%i56 = fadd contract <2 x float> %i49, zeroinitializer
%i57 = fadd contract <2 x float> %i50, zeroinitializer
br label %bb58
bb58: ; preds = %bb51, %bb16
%i59 = phi <2 x float> [ %i18, %bb16 ], [ %i52, %bb51 ]
%i60 = phi <2 x float> [ %i19, %bb16 ], [ %i53, %bb51 ]
%i61 = phi <2 x float> [ %i20, %bb16 ], [ %i54, %bb51 ]
%i62 = phi <2 x float> [ %i21, %bb16 ], [ %i55, %bb51 ]
%i63 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i56, %bb51 ]
%i64 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i57, %bb51 ]
%i65 = add nsw i64 %i17, %i6
%i66 = icmp slt i64 %i65, 0
br i1 %i66, label %bb16, label %bb12
}
; This testcase would fail on GFX908 due to not having a free VGPR available to
; copy SGPR to AGPR.
;
; Shape of the test: the first inline asm defines one value in v[0:31] and one
; in s[0:15]. The s[0:15] value is passed as the <16 x float> operand of the
; MFMA intrinsic, and the v[0:31] value is only consumed by the last inline
; asm, so it is still live while the SGPR value is moved into AGPRs.
; In the checks below, the GFX908 output moves every SGPR through v34
; (v_mov_b32 followed by v_accvgpr_write_b32), while the GFX90A output writes
; the AGPRs straight from the SGPRs.
define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v32, v1
; GFX908-NEXT: v_mov_b32_e32 v33, v0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] s[0:15]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_mov_b32_e32 v34, s15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s14
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a30, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s13
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a29, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a27, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s10
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a26, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s9
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a24, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a23, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s6
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s5
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a21, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a20, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a18, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a17, v34
; GFX908-NEXT: v_mov_b32_e32 v34, s0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v34
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v34, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a2 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a3 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a4 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a5 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a6 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a7 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a8 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a9 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a10 ; Reload Reuse
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v34, a1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a32, v34
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a1, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a2, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a3, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a4, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a5, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a6, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a7, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a8, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a9, v34 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a10, v34 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v33, v0
; GFX90A-NEXT: v_mov_b32_e32 v32, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31] s[0:15]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1
; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; Define a <32 x i32> value pinned to v[0:31] and a <16 x float> value pinned
; to s[0:15] in a single inline asm.
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"()
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
; NOTE(review): despite its name, %agpr0 is the value defined in s[0:15]; the
; name looks carried over from the AGPR-to-AGPR variant of this test.
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
; The SGPR-defined value is the <16 x float> operand of the MFMA; the checks
; above show it being copied into a[16:31] ahead of the instruction.
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
; Takes the same SGPR-defined value in an "a"-constrained operand, writes its
; result to a1 and clobbers a[0:15].
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
; Takes %agpr1 in an "a"-constrained operand and %mfma pinned to a[0:15];
; writes its result to a2.
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
; Consumes %agpr2 in a3 together with the v[0:31] value from the first asm,
; which keeps that value live across everything above.
call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
ret void
}
; MFMA intrinsic called by @no_free_vgprs_at_sgpr_to_agpr_copy; its last three
; operands are declared immarg.
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
; NOTE(review): no call to this intrinsic appears in the part of the file
; reviewed here; presumably an earlier function uses it - confirm before
; removing.
declare i32 @llvm.amdgcn.workitem.id.x() #2
; #0 is the attribute group attached to the no_free_vgprs_* test functions.
attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
; #1 and #2 are the attribute groups of the two intrinsic declarations above.
attributes #1 = { convergent nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }
; NOTE(review): #3 is not referenced in the part of the file reviewed here;
; presumably it belongs to an earlier function - confirm.
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }