llvm-project/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll

; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s

; If spilling to smem, additional registers are used for the resource
; descriptor.

; ALL-LABEL: {{^}}max_9_sgprs:

; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 9
; Function under test: takes five addrspace(1) i32 pointers and five i32
; values and stores each value through the matching pointer. Attribute
; group #0 requests "amdgpu-num-sgpr"="14"; the checks preceding this
; definition expect the reported SGPR count to come out at 9.
define void @max_9_sgprs(i32 addrspace(1)* %out1,

                          i32 addrspace(1)* %out2,
                          i32 addrspace(1)* %out3,
                          i32 addrspace(1)* %out4,
                          i32 addrspace(1)* %out5,
                          i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
  ; One plain (non-volatile) store per value/pointer pair.
  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  store i32 %five, i32 addrspace(1)* %out5
  ret void
}

; private resource: 4
; scratch wave offset: 1
; workgroup ids: 3
; dispatch id: 2
; queue ptr: 2
; flat scratch init: 2
; ---------------------
; total: 14

; + reserved vcc = 16

; Because we can't handle re-using the last few input registers as the
; special registers (vcc etc.), and can't decide to drop the unused
; features once the number of registers is frozen, this ends up using
; more registers than expected.

; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; TOSGPR: SGPRBlocks: 1
; TOSGPR: NumSGPRsForWavesPerEU: 16

; TOSMEM: s_mov_b64 s[10:11], s[2:3]
; TOSMEM: s_mov_b64 s[8:9], s[0:1]
; TOSMEM: s_mov_b32 s7, s13

; TOSMEM: SGPRBlocks: 1
; TOSMEM: NumSGPRsForWavesPerEU: 16
; Function under test: requests several implicit inputs through
; intrinsics (workgroup id x/y/z, dispatch id, dispatch ptr, queue ptr)
; while attribute group #2 limits it with "amdgpu-num-sgpr"="12".
; NOTE(review): the input-register tally in the comment block above the
; checks does not list the dispatch ptr, although it is requested below
; -- confirm whether the tally is still accurate.
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                        i32 addrspace(1)* %out2,
                                        i32 addrspace(1)* %out3,
                                        i32 addrspace(1)* %out4,
                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
  ; Query each implicit input; presumably this is what makes the
  ; corresponding input registers get enabled -- confirm.
  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
  ; Volatile store through an undef pointer in the default address space.
  ; NOTE(review): presumably here to force scratch / private-resource
  ; setup -- confirm.
  store volatile i32 0, i32* undef
  br label %stores

stores:
  ; Volatile stores of the queried values, so the queries cannot be
  ; dropped as dead.
  ; NOTE(review): %x.0 is stored three times while %x.1 and %x.2 are never
  ; used; this looks like a copy-paste slip, but changing it would alter
  ; the generated code the checks match -- confirm before touching.
  store volatile i32 %x.0, i32 addrspace(1)* undef
  store volatile i32 %x.0, i32 addrspace(1)* undef
  store volatile i32 %x.0, i32 addrspace(1)* undef
  store volatile i64 %x.3, i64 addrspace(1)* undef
  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef

  ; Plain stores of the four i32 arguments through the four pointer
  ; arguments.
  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  ret void
}

; The following test is commented out for now; http://llvm.org/PR31230
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure the copies of the input buffer registers are not clobbered.
; ; This requires copying the registers in a different order than what
; ; normally happens.

; XTOSMEM: s_mov_b32 s5, s11
; XTOSMEM: s_add_u32 m0, s5,
; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;                                        i32 addrspace(1)* %out2,
;                                        i32 addrspace(1)* %out3,
;                                        i32 addrspace(1)* %out4,
;                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, i32* undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

; Declarations of the AMDGPU intrinsics called by the test functions in
; this file. All of them carry attribute group #1.
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1

; #0: used by @max_9_sgprs.
attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
; #1: used by the intrinsic declarations.
attributes #1 = { nounwind readnone }
; #2: used by @max_12_sgprs_14_input_sgprs (and by the commented-out
;     @max_12_sgprs_12_input_sgprs test).
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
; #3: not referenced by anything in the visible part of this file.
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it suggested as a better solution by Matt llvm-svn: 287942 2016-11-26 01:37:09 +08:00			`; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s`
			`; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s \| FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s`
[AMDGPU] Wave and register controls - Implemented amdgpu-flat-work-group-size attribute - Implemented amdgpu-num-active-waves-per-eu attribute - Implemented amdgpu-num-sgpr attribute - Implemented amdgpu-num-vgpr attribute - Dynamic LDS constraints are in a separate patch Patch by Tom Stellard and Konstantin Zhuravlyov Differential Revision: https://reviews.llvm.org/D21562 llvm-svn: 280747 2016-09-07 04:22:28 +08:00
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it suggested as a better solution by Matt llvm-svn: 287942 2016-11-26 01:37:09 +08:00			`; If spilling to smem, additional registers are used for the resource`
			`; descriptor.`

[AMDGPU] Fix typo in GCNSchedStrategy Differential revision: https://reviews.llvm.org/D28980 llvm-svn: 293171 2017-01-26 18:51:47 +08:00			`; ALL-LABEL: {{^}}max_9_sgprs:`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it suggested as a better solution by Matt llvm-svn: 287942 2016-11-26 01:37:09 +08:00			`; ALL: SGPRBlocks: 1`
[AMDGPU] Fix typo in GCNSchedStrategy Differential revision: https://reviews.llvm.org/D28980 llvm-svn: 293171 2017-01-26 18:51:47 +08:00			`; ALL: NumSGPRsForWavesPerEU: 9`
			`define void @max_9_sgprs(i32 addrspace(1)* %out1,`
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it suggested as a better solution by Matt llvm-svn: 287942 2016-11-26 01:37:09 +08:00
[AMDGPU] Wave and register controls - Implemented amdgpu-flat-work-group-size attribute - Implemented amdgpu-num-active-waves-per-eu attribute - Implemented amdgpu-num-sgpr attribute - Implemented amdgpu-num-vgpr attribute - Dynamic LDS constraints are in a separate patch Patch by Tom Stellard and Konstantin Zhuravlyov Differential Revision: https://reviews.llvm.org/D21562 llvm-svn: 280747 2016-09-07 04:22:28 +08:00			`i32 addrspace(1)* %out2,`
			`i32 addrspace(1)* %out3,`
			`i32 addrspace(1)* %out4,`
[AMDGPU] Revert failed scheduling This patch reverts region's scheduling to the original untouched state in case if we have have decreased occupancy. In addition it switches to use TargetRegisterInfo occupancy callback for pressure limits instead of gradually increasing limits which were just passed by. We are going to stay with the best schedule so we do not need to tolerate worsened scheduling anymore. Differential Revision: https://reviews.llvm.org/D29971 llvm-svn: 295206 2017-02-16 01:19:50 +08:00			`i32 addrspace(1)* %out5,`
			`i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {`
[AMDGPU] Wave and register controls - Implemented amdgpu-flat-work-group-size attribute - Implemented amdgpu-num-active-waves-per-eu attribute - Implemented amdgpu-num-sgpr attribute - Implemented amdgpu-num-vgpr attribute - Dynamic LDS constraints are in a separate patch Patch by Tom Stellard and Konstantin Zhuravlyov Differential Revision: https://reviews.llvm.org/D21562 llvm-svn: 280747 2016-09-07 04:22:28 +08:00			`store i32 %one, i32 addrspace(1)* %out1`
			`store i32 %two, i32 addrspace(1)* %out2`
			`store i32 %three, i32 addrspace(1)* %out3`
			`store i32 %four, i32 addrspace(1)* %out4`
[AMDGPU] Revert failed scheduling This patch reverts region's scheduling to the original untouched state in case if we have have decreased occupancy. In addition it switches to use TargetRegisterInfo occupancy callback for pressure limits instead of gradually increasing limits which were just passed by. We are going to stay with the best schedule so we do not need to tolerate worsened scheduling anymore. Differential Revision: https://reviews.llvm.org/D29971 llvm-svn: 295206 2017-02-16 01:19:50 +08:00			`store i32 %five, i32 addrspace(1)* %out5`
[AMDGPU] Wave and register controls - Implemented amdgpu-flat-work-group-size attribute - Implemented amdgpu-num-active-waves-per-eu attribute - Implemented amdgpu-num-sgpr attribute - Implemented amdgpu-num-vgpr attribute - Dynamic LDS constraints are in a separate patch Patch by Tom Stellard and Konstantin Zhuravlyov Differential Revision: https://reviews.llvm.org/D21562 llvm-svn: 280747 2016-09-07 04:22:28 +08:00			`ret void`
			`}`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
			`; private resource: 4`
			`; scratch wave offset: 1`
			`; workgroup ids: 3`
			`; dispatch id: 2`
			`; queue ptr: 2`
			`; flat scratch init: 2`
			`; ---------------------`
			`; total: 14`

AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects Summary: This frees 2 scalar registers. Reviewers: tstellarAMD Subscribers: qcolombet, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye Differential Revision: https://reviews.llvm.org/D27150 llvm-svn: 289261 2016-12-10 03:49:48 +08:00			`; + reserved vcc = 16`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
			`; Because we can't handle re-using the last few input registers as the`
			`; special vcc etc. registers (as well as decide to not use the unused`
			`; features when the number of registers is frozen), this ends up using`
			`; more than expected.`

			`; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:`
AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects Summary: This frees 2 scalar registers. Reviewers: tstellarAMD Subscribers: qcolombet, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye Differential Revision: https://reviews.llvm.org/D27150 llvm-svn: 289261 2016-12-10 03:49:48 +08:00			`; TOSGPR: SGPRBlocks: 1`
			`; TOSGPR: NumSGPRsForWavesPerEU: 16`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
AMDGPU/SI: Don't reserve XNACK when it's disabled Summary: This frees 2 additional scalar registers. These are results from all of my 3 patches combined: Polaris: Spilled SGPRs: 2231 -> 1517 (-32.00 %) Tonga: Spilled SGPRs: 3829 -> 2608 (-31.89 %) Spilled VGPRs: 100 -> 84 (-16.00 %) Tonga even spills SGPRs via VGPRs to scratch. That's a compute shader limited to 64 VGPRs. Reviewers: tstellarAMD Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye Differential Revision: https://reviews.llvm.org/D27151 llvm-svn: 289262 2016-12-10 03:49:54 +08:00			`; TOSMEM: s_mov_b64 s[10:11], s[2:3]`
			`; TOSMEM: s_mov_b64 s[8:9], s[0:1]`
			`; TOSMEM: s_mov_b32 s7, s13`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects Summary: This frees 2 scalar registers. Reviewers: tstellarAMD Subscribers: qcolombet, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye Differential Revision: https://reviews.llvm.org/D27150 llvm-svn: 289261 2016-12-10 03:49:48 +08:00			`; TOSMEM: SGPRBlocks: 1`
			`; TOSMEM: NumSGPRsForWavesPerEU: 16`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00			`define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,`
			`i32 addrspace(1)* %out2,`
			`i32 addrspace(1)* %out3,`
			`i32 addrspace(1)* %out4,`
			`i32 %one, i32 %two, i32 %three, i32 %four) #2 {`
			`%x.0 = call i32 @llvm.amdgcn.workgroup.id.x()`
			`%x.1 = call i32 @llvm.amdgcn.workgroup.id.y()`
			`%x.2 = call i32 @llvm.amdgcn.workgroup.id.z()`
			`%x.3 = call i64 @llvm.amdgcn.dispatch.id()`
			`%x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()`
			`%x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()`
[AMDGPU] Revert failed scheduling This patch reverts region's scheduling to the original untouched state in case if we have have decreased occupancy. In addition it switches to use TargetRegisterInfo occupancy callback for pressure limits instead of gradually increasing limits which were just passed by. We are going to stay with the best schedule so we do not need to tolerate worsened scheduling anymore. Differential Revision: https://reviews.llvm.org/D29971 llvm-svn: 295206 2017-02-16 01:19:50 +08:00			`store volatile i32 0, i32* undef`
			`br label %stores`

			`stores:`
			`store volatile i32 %x.0, i32 addrspace(1)* undef`
			`store volatile i32 %x.0, i32 addrspace(1)* undef`
			`store volatile i32 %x.0, i32 addrspace(1)* undef`
			`store volatile i64 %x.3, i64 addrspace(1)* undef`
			`store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00			`store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef`

			`store i32 %one, i32 addrspace(1)* %out1`
			`store i32 %two, i32 addrspace(1)* %out2`
			`store i32 %three, i32 addrspace(1)* %out3`
			`store i32 %four, i32 addrspace(1)* %out4`
			`ret void`
			`}`

RegisterCoalscer: Only coalesce complete reserved registers. The coalescer eliminates copies from reserved registers of the form: %vregX = COPY %rY in the case where %rY is a reserved register. However this turns out to be invalid if only some of the subregisters are reserved (see also https://reviews.llvm.org/D26648). Differential Revision: https://reviews.llvm.org/D26687 llvm-svn: 288428 2016-12-02 06:39:51 +08:00			`; The following test is commented out for now; http://llvm.org/PR31230`
			`; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00			`; ; Make sure copies for input buffer are not clobbered. This requires`
			`; ; swapping the order the registers are copied from what normally`
			`; ; happens.`

RegisterCoalscer: Only coalesce complete reserved registers. The coalescer eliminates copies from reserved registers of the form: %vregX = COPY %rY in the case where %rY is a reserved register. However this turns out to be invalid if only some of the subregisters are reserved (see also https://reviews.llvm.org/D26648). Differential Revision: https://reviews.llvm.org/D26687 llvm-svn: 288428 2016-12-02 06:39:51 +08:00			`; XTOSMEM: s_mov_b32 s5, s11`
			`; XTOSMEM: s_add_u32 m0, s5,`
			`; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0`

			`; XALL: SGPRBlocks: 2`
			`; XALL: NumSGPRsForWavesPerEU: 18`
			`;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,`
			`; i32 addrspace(1)* %out2,`
			`; i32 addrspace(1)* %out3,`
			`; i32 addrspace(1)* %out4,`
			`; i32 %one, i32 %two, i32 %three, i32 %four) #2 {`
			`; store volatile i32 0, i32* undef`
			`; %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()`
			`; store volatile i32 %x.0, i32 addrspace(1)* undef`
			`; %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()`
			`; store volatile i32 %x.0, i32 addrspace(1)* undef`
			`; %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()`
			`; store volatile i32 %x.0, i32 addrspace(1)* undef`
			`; %x.3 = call i64 @llvm.amdgcn.dispatch.id()`
			`; store volatile i64 %x.3, i64 addrspace(1)* undef`
			`; %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()`
			`; store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef`
			`;`
			`; store i32 %one, i32 addrspace(1)* %out1`
			`; store i32 %two, i32 addrspace(1)* %out2`
			`; store i32 %three, i32 addrspace(1)* %out3`
			`; store i32 %four, i32 addrspace(1)* %out4`
			`; ret void`
			`;}`
AMDGPU: Fix using incorrect private resource with no allocation It's possible to have a use of the private resource descriptor or scratch wave offset registers even though there are no allocated stack objects. This would result in continuing to use the maximum number reserved registers. This could go over the number of SGPRs available on VI, or violate the SGPR limit requested by the function attributes. llvm-svn: 285435 2016-10-29 03:43:31 +08:00
			`declare i32 @llvm.amdgcn.workgroup.id.x() #1`
			`declare i32 @llvm.amdgcn.workgroup.id.y() #1`
			`declare i32 @llvm.amdgcn.workgroup.id.z() #1`
			`declare i64 @llvm.amdgcn.dispatch.id() #1`
			`declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1`
			`declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1`

			`attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }`
			`attributes #1 = { nounwind readnone }`
			`attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }`
			`attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }`