From 3759398b4bf2d8b72f305dbfa6aa4108a2bfc273 Mon Sep 17 00:00:00 2001 From: Abinav Puthan Purayil Date: Thu, 15 Sep 2022 18:40:36 +0530 Subject: [PATCH] [AMDGPU] Report minimum scratch size in code object v5 and later by default This change sets -amdgpu-assume-{external-call-stack-size | dynamic-stack-object-size} options to zero by default for code object v5 and later. The runtime is expected to adjust the scratch size if the amdhsa_uses_dynamic_stack bit in the kernel descriptor is set. Differential Revision: https://reviews.llvm.org/D128346 --- llvm/docs/AMDGPUUsage.rst | 11 ++++++----- .../Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 15 ++++++++++++--- .../CodeGen/AMDGPU/call-graph-register-usage.ll | 13 +++++++++++++ llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 8 ++++++++ llvm/test/CodeGen/AMDGPU/recursion.ll | 13 ++++++++++++- 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index f591e60862fb..1e1765bd6262 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -3884,11 +3884,12 @@ The fields used by CP for code objects before V3 also match those specified in 63:32 4 bytes PRIVATE_SEGMENT_FIXED_SIZE The amount of fixed private address space memory required for a - work-item in bytes. - Additional space may need to - be added to this value if - the call stack has - non-inlined function calls. + work-item in bytes. When + this cannot be predicted, + code object v4 and older + sets this value to be + higher than the minimum + requirement. 95:64 4 bytes KERNARG_SIZE The size of the kernarg memory pointed to by the AQL dispatch packet. The diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index bacf45639a77..ede2b2b671c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -43,9 +43,9 @@ using namespace llvm::AMDGPU; char llvm::AMDGPUResourceUsageAnalysis::ID = 0; char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID; -// We need to tell the runtime some amount ahead of time if we don't know the -// true stack size. Assume a smaller number if this is only due to dynamic / -// non-entry block allocas. +// In code object v4 and older, we need to tell the runtime some amount ahead of +// time if we don't know the true stack size. Assume a smaller number if this is +// only due to dynamic / non-entry block allocas. static cl::opt AssumedStackSizeForExternalCall( "amdgpu-assume-external-call-stack-size", cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, @@ -109,6 +109,15 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { CallGraph CG = CallGraph(M); auto End = po_end(&CG); + // By default, for code object v5 and later, track only the minimum scratch + // size + if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) { + if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) + AssumedStackSizeForDynamicSizeObjects = 0; + if (!AssumedStackSizeForExternalCall.getNumOccurrences()) + AssumedStackSizeForExternalCall = 0; + } + for (auto IT = po_begin(&CG); IT != End; ++IT) { Function *F = IT->getFunction(); if (!F || F->isDeclaration()) diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 1a9ec6c31555..fe9743bc690c 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-V5 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s @@ -182,6 +183,9 @@ declare void @external() #0 ; NumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 +; +; GCN-V5-LABEL: {{^}}usage_external: +; GCN-V5: ScratchSize: 0 define amdgpu_kernel void @usage_external() #0 { call void @external() ret void @@ -194,6 +198,9 @@ declare void @external_recurse() #2 ; NumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 +; +; GCN-V5-LABEL: {{^}}usage_external_recurse: +; GCN-V5: ScratchSize: 0 define amdgpu_kernel void @usage_external_recurse() #0 { call void @external_recurse() ret void @@ -201,6 +208,9 @@ define amdgpu_kernel void @usage_external_recurse() #0 { ; GCN-LABEL: {{^}}direct_recursion_use_stack: ; GCN: ScratchSize: 18448{{$}} +; +; GCN-V5-LABEL: {{^}}direct_recursion_use_stack: +; GCN-V5: ScratchSize: 2064{{$}} define void @direct_recursion_use_stack(i32 %val) #2 { %alloca = alloca [512 x i32], align 4, addrspace(5) call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0 @@ -220,6 +230,9 @@ ret: ; GCN: is_ptr64 = 1 ; GCN: is_dynamic_callstack = 1 ; GCN: workitem_private_segment_byte_size = 18448{{$}} +; +; GCN-V5-LABEL: {{^}}usage_direct_recursion: +; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}} define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { call void @direct_recursion_use_stack(i32 %n) ret void diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index d626d8477eda..b180df078282 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s @@ -110,6 +112,9 @@ bb.2: } ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 ; DEFAULTSIZE: ; ScratchSize: 4112 +; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16 +; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1 +; DEFAULTSIZE-V5: ; ScratchSize: 16 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 ; ASSUME1024: ; ScratchSize: 1040 @@ -203,6 +208,9 @@ bb.1: ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160 ; DEFAULTSIZE: ; ScratchSize: 4160 +; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64 +; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1 +; DEFAULTSIZE-V5: ; ScratchSize: 64 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088 ; ASSUME1024: ; ScratchSize: 1088 diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll index 14c97508da6a..8e8657d0b9af 100644 --- a/llvm/test/CodeGen/AMDGPU/recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/recursion.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=V5 %s ; CHECK-LABEL: {{^}}recursive: ; CHECK: ScratchSize: 16 @@ -28,9 +29,13 @@ define void @tail_recursive_with_stack() { ret void } -; For an arbitrary recursive call, report a large number for unknown stack usage. +; For an arbitrary recursive call, report a large number for unknown stack +; usage for code object v4 and older ; CHECK-LABEL: {{^}}calls_recursive: ; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}} +; +; V5-LABEL: {{^}}calls_recursive: +; V5: .amdhsa_private_segment_fixed_size 0{{$}} define amdgpu_kernel void @calls_recursive() { call void @recursive() ret void @@ -51,6 +56,9 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() { ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive: ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; +; V5-LABEL: {{^}}kernel_calls_tail_recursive: +; V5: .amdhsa_private_segment_fixed_size 0{{$}} define amdgpu_kernel void @kernel_calls_tail_recursive() { call void @tail_recursive() ret void @@ -58,6 +66,9 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() { ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack: ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}} +; +; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack: +; V5: .amdhsa_private_segment_fixed_size 8{{$}} define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() { call void @tail_recursive_with_stack() ret void