[AMDGPU] Report minimum scratch size in code object v5 and later by default

This change sets
-amdgpu-assume-{external-call-stack-size | dynamic-stack-object-size}
options to zero by default for code object v5 and later. The runtime is
expected to adjust the scratch size if the amdhsa_uses_dynamic_stack bit
in the kernel descriptor is set.

Differential Revision: https://reviews.llvm.org/D128346
This commit is contained in:
Abinav Puthan Purayil 2022-09-15 18:40:36 +05:30
parent b55e76de8b
commit 3759398b4b
5 changed files with 51 additions and 9 deletions

View File

@ -3884,11 +3884,12 @@ The fields used by CP for code objects before V3 also match those specified in
63:32 4 bytes PRIVATE_SEGMENT_FIXED_SIZE The amount of fixed
private address space
memory required for a
work-item in bytes.
Additional space may need to
be added to this value if
the call stack has
non-inlined function calls.
work-item in bytes. When
this cannot be predicted,
code object v4 and older
sets this value to be
higher than the minimum
requirement.
95:64 4 bytes KERNARG_SIZE The size of the kernarg
memory pointed to by the
AQL dispatch packet. The

View File

@ -43,9 +43,9 @@ using namespace llvm::AMDGPU;
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
"amdgpu-assume-external-call-stack-size",
cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
@ -109,6 +109,15 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
auto End = po_end(&CG);
// By default, for code object v5 and later, track only the minimum scratch
// size
if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
if (!AssumedStackSizeForExternalCall.getNumOccurrences())
AssumedStackSizeForExternalCall = 0;
}
for (auto IT = po_begin(&CG); IT != End; ++IT) {
Function *F = IT->getFunction();
if (!F || F->isDeclaration())

View File

@ -1,4 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-V5 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
@ -182,6 +183,9 @@ declare void @external() #0
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
; GCN-V5-LABEL: {{^}}usage_external:
; GCN-V5: ScratchSize: 0
define amdgpu_kernel void @usage_external() #0 {
call void @external()
ret void
@ -194,6 +198,9 @@ declare void @external_recurse() #2
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
; GCN-V5-LABEL: {{^}}usage_external_recurse:
; GCN-V5: ScratchSize: 0
define amdgpu_kernel void @usage_external_recurse() #0 {
call void @external_recurse()
ret void
@ -201,6 +208,9 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
; GCN-LABEL: {{^}}direct_recursion_use_stack:
; GCN: ScratchSize: 18448{{$}}
;
; GCN-V5-LABEL: {{^}}direct_recursion_use_stack:
; GCN-V5: ScratchSize: 2064{{$}}
define void @direct_recursion_use_stack(i32 %val) #2 {
%alloca = alloca [512 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
@ -220,6 +230,9 @@ ret:
; GCN: is_ptr64 = 1
; GCN: is_dynamic_callstack = 1
; GCN: workitem_private_segment_byte_size = 18448{{$}}
;
; GCN-V5-LABEL: {{^}}usage_direct_recursion:
; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void

View File

@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
@ -110,6 +112,9 @@ bb.2:
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 16
; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040
@ -203,6 +208,9 @@ bb.1:
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 64
; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088

View File

@ -1,4 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
; CHECK: ScratchSize: 16
@ -28,9 +29,13 @@ define void @tail_recursive_with_stack() {
ret void
}
; For an arbitrary recursive call, report a large number for unknown stack usage.
; For an arbitrary recursive call, report a large number for unknown stack
; usage for code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
;
; V5-LABEL: {{^}}calls_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
define amdgpu_kernel void @calls_recursive() {
call void @recursive()
ret void
@ -51,6 +56,9 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
define amdgpu_kernel void @kernel_calls_tail_recursive() {
call void @tail_recursive()
ret void
@ -58,6 +66,9 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() {
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; V5: .amdhsa_private_segment_fixed_size 8{{$}}
define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
call void @tail_recursive_with_stack()
ret void