llvm-project/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

declare void @extern_func()

define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs not be
; preserved across the call and should get 8 scratch registers.

; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX9: v_mov_b32_e32 v37, v11
; GFX9-NEXT: v_mov_b32_e32 v38, v10
; GFX9-NEXT: v_mov_b32_e32 v49, v9
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9-NEXT: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12

; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND

; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]

; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12

; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND

; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]

; GFX10: buffer_load_dword v43, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12

; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  call void @extern_func()
  ret <4 x float> %v
}

define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved
; across the call and should get allcoated to 8 CSRs.
; Only the lower 5 sub-registers of the tuple are preserved.
; The upper 3 sub-registers are unused.

; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX9: v_mov_b32_e32 v44, v16
; GFX9-NEXT: v_mov_b32_e32 v43, v15
; GFX9-NEXT: v_mov_b32_e32 v42, v14
; GFX9-NEXT: v_mov_b32_e32 v41, v13
; GFX9-NEXT: v_mov_b32_e32 v40, v12

; GFX9: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1

; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload

; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX10: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX10-NEXT: v_mov_b32_e32 v40, v16
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1
; GFX10-NEXT: v_mov_b32_e32 v41, v15
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: v_mov_b32_e32 v43, v13
; GFX10-NEXT: v_mov_b32_e32 v44, v12
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1

; GFX10: buffer_load_dword v44, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20
; GFX10: s_setpc_b64 s[4:5]
main_body:
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  store <4 x float> %v, <4 x float> addrspace(1)* undef
  call void @extern_func()
  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  ret <4 x float> %v1
}

declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1

attributes #0 = { nounwind writeonly }
attributes #1 = { nounwind readonly }
[AMDGPU] Fixed the test by adding the triple. 2020-05-06 02:40:14 +08:00			`; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s`
			`; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX10 %s`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00
			`declare void @extern_func()`

			`define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {`
			`; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs not be`
			`; preserved across the call and should get 8 scratch registers.`

			`; GFX9-LABEL: non_preserved_vgpr_tuple8:`
			`; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill`
			`; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill`

			`; GFX9: v_mov_b32_e32 v37, v11`
			`; GFX9-NEXT: v_mov_b32_e32 v38, v10`
			`; GFX9-NEXT: v_mov_b32_e32 v49, v9`
			`; GFX9-NEXT: v_writelane_b32 v44, s30, 0`
			`; GFX9-NEXT: v_mov_b32_e32 v36, v16`
			`; GFX9-NEXT: v_mov_b32_e32 v35, v15`
			`; GFX9-NEXT: v_mov_b32_e32 v34, v14`
			`; GFX9-NEXT: v_mov_b32_e32 v33, v13`
			`; GFX9-NEXT: v_mov_b32_e32 v32, v12`

			`; GFX9: ;;#ASMSTART`
			`; GFX9-NEXT: ;;#ASMEND`

			`; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1`
			`; GFX9-NEXT: s_getpc_b64 s[4:5]`
			`; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4`
			`; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4`
			`; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0`
			`; GFX9-NEXT: s_waitcnt lgkmcnt(0)`
			`; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]`

			`; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload`
			`; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload`
			`; GFX9: s_setpc_b64 s[4:5]`
			`;`
			`; GFX10-LABEL: non_preserved_vgpr_tuple8:`
			`; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill`
			`; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill`

			`; GFX10: v_mov_b32_e32 v36, v16`
			`; GFX10-NEXT: v_mov_b32_e32 v35, v15`
			`; GFX10-NEXT: v_mov_b32_e32 v34, v14`
			`; GFX10-NEXT: v_mov_b32_e32 v33, v13`
			`; GFX10-NEXT: v_mov_b32_e32 v32, v12`

			`; GFX10: ;;#ASMSTART`
			`; GFX10-NEXT: ;;#ASMEND`

			`; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1`
[AMDGPU] Update VMEM scalar write hazard mitigation sequence Using s_waitcnt_depctr 0xffe3 is potentially faster than v_nop. Reviewed By: rampitec, foad Differential Revision: https://reviews.llvm.org/D83872 2020-07-16 10:07:26 +08:00			`; GFX10-NEXT: s_waitcnt_depctr 0xffe3`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00			`; GFX10-NEXT: s_getpc_b64 s[4:5]`
			`; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4`
			`; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4`
			`; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0`
			`; GFX10-NEXT: s_waitcnt lgkmcnt(0)`
			`; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]`

[AMDGPU] New SIInsertHardClauses pass Enable clausing of memory loads on gfx10 by adding a new pass to insert the s_clause instructions that mark the start of each hard clause. Differential Revision: https://reviews.llvm.org/D79792 2020-05-01 23:43:12 +08:00			`; GFX10: buffer_load_dword v43, off, s[0:3], s33`
			`; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4`
			`; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8`
			`; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00
			`; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload`
			`; GFX10: s_setpc_b64 s[4:5]`
			`main_body:`
			`call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0`
			`call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0`
			`call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0`
			`call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0`
			`%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)`
			`call void @extern_func()`
			`ret <4 x float> %v`
			`}`

			`define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {`
			`; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved`
			`; across the call and should get allcoated to 8 CSRs.`
			`; Only the lower 5 sub-registers of the tuple are preserved.`
			`; The upper 3 sub-registers are unused.`

			`; GFX9-LABEL: call_preserved_vgpr_tuple8:`
			`; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill`
			`; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill`
			`; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill`

			`; GFX9: v_mov_b32_e32 v44, v16`
			`; GFX9-NEXT: v_mov_b32_e32 v43, v15`
			`; GFX9-NEXT: v_mov_b32_e32 v42, v14`
			`; GFX9-NEXT: v_mov_b32_e32 v41, v13`
			`; GFX9-NEXT: v_mov_b32_e32 v40, v12`

			`; GFX9: s_getpc_b64 s[4:5]`
			`; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4`
			`; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4`
			`; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0`
			`; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1`
			`; GFX9-NEXT: s_waitcnt vmcnt(0)`
			`; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off`
			`; GFX9-NEXT: s_waitcnt lgkmcnt(0)`
			`; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]`
			`; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1`

			`; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload`
			`; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload`

			`; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload`
			`; GFX9: s_setpc_b64 s[4:5]`
			`;`
			`; GFX10-LABEL: call_preserved_vgpr_tuple8:`
			`; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill`
			`; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill`
			`; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill`

			`; GFX10: s_getpc_b64 s[4:5]`
			`; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4`
			`; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4`
			`; GFX10-NEXT: v_mov_b32_e32 v40, v16`
[AMDGPU] More accurate gfx10 latencies Differential Revision: https://reviews.llvm.org/D81012 2020-03-11 19:33:30 +08:00			`; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00			`; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1`
[AMDGPU] More accurate gfx10 latencies Differential Revision: https://reviews.llvm.org/D81012 2020-03-11 19:33:30 +08:00			`; GFX10-NEXT: v_mov_b32_e32 v41, v15`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00			`; GFX10-NEXT: v_mov_b32_e32 v42, v14`
			`; GFX10-NEXT: v_mov_b32_e32 v43, v13`
			`; GFX10-NEXT: v_mov_b32_e32 v44, v12`
			`; GFX10-NEXT: ; implicit-def: $vcc_hi`
			`; GFX10-NEXT: s_waitcnt vmcnt(0)`
			`; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off`
			`; GFX10-NEXT: s_waitcnt lgkmcnt(0)`
			`; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]`
			`; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1`

[AMDGPU] New SIInsertHardClauses pass Enable clausing of memory loads on gfx10 by adding a new pass to insert the s_clause instructions that mark the start of each hard clause. Differential Revision: https://reviews.llvm.org/D79792 2020-05-01 23:43:12 +08:00			`; GFX10: buffer_load_dword v44, off, s[0:3], s33`
			`; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4`
			`; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8`
			`; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12`
			`; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16`
			`; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20`
[AMDGPU] Introduce more scratch registers in the ABI. The AMDGPU target has a convention that defined all VGPRs (execept the initial 32 argument registers) as callee-saved. This convention is not efficient always, esp. when the callee requiring more registers, ended up emitting a large number of spills, even though its caller requires only a few. This patch revises the ABI by introducing more scratch registers that a callee can freely use. The 256 vgpr registers now become: 32 argument registers 112 scratch registers and 112 callee saved registers. The scratch registers and the CSRs are intermixed at regular intervals (a split boundary of 8) to obtain a better occupancy. Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr Reviewed By: arsenm, t-tye Differential Revision: https://reviews.llvm.org/D76356 2020-03-27 15:46:51 +08:00			`; GFX10: s_setpc_b64 s[4:5]`
			`main_body:`
			`%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)`
			`store <4 x float> %v, <4 x float> addrspace(1)* undef`
			`call void @extern_func()`
			`%v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)`
			`ret <4 x float> %v1`
			`}`

			`declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1`

			`attributes #0 = { nounwind writeonly }`
			`attributes #1 = { nounwind readonly }`