AMDGPU: Annotate functions that have stack objects

Relying on any MachineFunction state in the MachineFunctionInfo
constructor is hazardous, because the construction time is unclear and
determined by the first use. The function may be only partially
constructed, which is part of why we have many of these hacky string
attributes to track what we need for ABI lowering.

For SelectionDAG, all stack objects are created up-front before
calling convention lowering so stack objects are visible at
construction time. For GlobalISel, none of the IR function has been
visited yet and the allocas haven't been added to the MachineFrameInfo
yet. This should fix failing to set flat_scratch_init in GlobalISel
when needed.

This pass really needs to be turned into some kind of analysis, but I
haven't found a nice way to use one here.
This commit is contained in:
Matt Arsenault 2020-05-19 14:32:31 -04:00
parent 3d0d2fefc0
commit 21d2884a9c
4 changed files with 71 additions and 17 deletions

View File

@@ -279,6 +279,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
bool HaveStackObjects = false;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
@@ -286,6 +287,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
if (isa<AllocaInst>(I)) {
HaveStackObjects = true;
continue;
}
if (auto *CB = dyn_cast<CallBase>(&I)) {
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
@@ -355,6 +361,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}
if (HaveStackObjects) {
F.addFnAttr("amdgpu-stack-objects");
Changed = true;
}
return Changed;
}

View File

@@ -55,11 +55,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
Occupancy = ST.computeOccupancy(MF, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
// FIXME: Should have analysis or something rather than attribute to detect
// calls.
const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls");
const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
// Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
// have any calls.
@@ -125,8 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ = true;
}
bool HasStackObjects = FrameInfo.hasStackObjects();
bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
@@ -170,20 +168,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
auto hasNonSpillStackObjects = [&]() {
// Avoid expensive checking if there's no stack objects.
if (!HasStackObjects)
return false;
for (auto OI = FrameInfo.getObjectIndexBegin(),
OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
if (!FrameInfo.isSpillSlotObjectIndex(OI))
return true;
// All stack objects are spill slots.
return false;
};
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls that may require it before argument lowering.
if (HasCalls || hasNonSpillStackObjects())
// detecting calls or stack objects that may require it before argument
// lowering.
if (HasCalls || HasStackObjects)
FlatScratchInit = true;
}

View File

@@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; Make sure flat_scratch_init is set
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; The private (addrspace 5) alloca escapes to the flat address space via
; addrspacecast; it is still a stack object, so flat_scratch_init must be
; enabled (see the .amdhsa_user_sgpr_flat_scratch_init 1 check above).
; The volatile store keeps the alloca from being optimized away.
%alloca = alloca i32, addrspace(5)
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
store volatile i32 0, i32* %cast
ret void
}
; TODO: Could optimize out in this case
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; Plain private (addrspace 5) alloca with no calls; the volatile store
; keeps it alive. Per the TODO above, flat_scratch_init could in theory
; be omitted here, but currently any stack object sets it.
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
ret void
}
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
define amdgpu_kernel void @kernel_no_calls_no_stack() {
; Negative case: no allocas and no calls, so flat_scratch_init stays
; disabled (checked as .amdhsa_user_sgpr_flat_scratch_init 0 above).
ret void
}

View File

@@ -1,5 +1,7 @@
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
declare i32 @llvm.amdgcn.workgroup.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.y() #0
declare i32 @llvm.amdgcn.workgroup.id.z() #0
@@ -250,6 +252,31 @@ define amdgpu_kernel void @use_is_private(i8* %ptr) #1 {
ret void
}
; HSA: define amdgpu_kernel void @use_alloca() #13 {
define amdgpu_kernel void @use_alloca() #1 {
; Entry-block alloca in a kernel: the annotation pass should rewrite the
; attribute group to #13, which includes "amdgpu-stack-objects".
%alloca = alloca i32, addrspace(5)
store i32 0, i32 addrspace(5)* %alloca
ret void
}
; HSA: define amdgpu_kernel void @use_alloca_non_entry_block() #13 {
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
; The alloca lives in a non-entry block; the pass walks every basic
; block, so it must still be detected and get attribute group #13.
entry:
br label %bb
bb:
%alloca = alloca i32, addrspace(5)
store i32 0, i32 addrspace(5)* %alloca
ret void
}
; HSA: define void @use_alloca_func() #13 {
define void @use_alloca_func() #1 {
; Non-kernel (callable) function with an alloca: also expected to get
; the "amdgpu-stack-objects" attribute (group #13).
%alloca = alloca i32, addrspace(5)
store i32 0, i32 addrspace(5)* %alloca
ret void
}
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
@@ -266,3 +293,4 @@ attributes #1 = { nounwind }
; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }