forked from OSchip/llvm-project
AMDGPU: Annotate functions that have stack objects
Relying on any MachineFunction state in the MachineFunctionInfo constructor is hazardous, because the construction time is unclear and determined by the first use. The function may be only partially constructed, which is part of why we have many of these hacky string attributes to track what we need for ABI lowering. For SelectionDAG, all stack objects are created up-front before calling convention lowering, so stack objects are visible at construction time. For GlobalISel, none of the IR function has been visited yet and the allocas haven't been added to the MachineFrameInfo yet. This should fix failing to set flat_scratch_init in GlobalISel when it is needed. This pass really needs to be turned into some kind of analysis, but I haven't found a nice way to use one here.
This commit is contained in:
parent
3d0d2fefc0
commit
21d2884a9c
|
@@ -279,6 +279,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
bool HasApertureRegs = ST.hasApertureRegs();
|
||||
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
|
||||
|
||||
bool HaveStackObjects = false;
|
||||
bool Changed = false;
|
||||
bool NeedQueuePtr = false;
|
||||
bool HaveCall = false;
|
||||
|
@@ -286,6 +287,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
|
||||
for (BasicBlock &BB : F) {
|
||||
for (Instruction &I : BB) {
|
||||
if (isa<AllocaInst>(I)) {
|
||||
HaveStackObjects = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto *CB = dyn_cast<CallBase>(&I)) {
|
||||
const Function *Callee =
|
||||
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
|
||||
|
@@ -355,6 +361,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
Changed = true;
|
||||
}
|
||||
|
||||
if (HaveStackObjects) {
|
||||
F.addFnAttr("amdgpu-stack-objects");
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
|
|
@@ -55,11 +55,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||
|
||||
Occupancy = ST.computeOccupancy(MF, getLDSSize());
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
||||
|
||||
// FIXME: Should have analysis or something rather than attribute to detect
|
||||
// calls.
|
||||
const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls");
|
||||
const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
|
||||
|
||||
// Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
|
||||
// have any calls.
|
||||
|
@@ -125,8 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||
WorkItemIDZ = true;
|
||||
}
|
||||
|
||||
bool HasStackObjects = FrameInfo.hasStackObjects();
|
||||
|
||||
bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
|
||||
if (isEntryFunction()) {
|
||||
// X, XY, and XYZ are the only supported combinations, so make sure Y is
|
||||
// enabled if Z is.
|
||||
|
@@ -170,20 +168,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||
KernargSegmentPtr = true;
|
||||
|
||||
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
|
||||
auto hasNonSpillStackObjects = [&]() {
|
||||
// Avoid expensive checking if there's no stack objects.
|
||||
if (!HasStackObjects)
|
||||
return false;
|
||||
for (auto OI = FrameInfo.getObjectIndexBegin(),
|
||||
OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
|
||||
if (!FrameInfo.isSpillSlotObjectIndex(OI))
|
||||
return true;
|
||||
// All stack objects are spill slots.
|
||||
return false;
|
||||
};
|
||||
// TODO: This could be refined a lot. The attribute is a poor way of
|
||||
// detecting calls that may require it before argument lowering.
|
||||
if (HasCalls || hasNonSpillStackObjects())
|
||||
// detecting calls or stack objects that may require it before argument
|
||||
// lowering.
|
||||
if (HasCalls || HasStackObjects)
|
||||
FlatScratchInit = true;
|
||||
}
|
||||
|
||||
|
|
|
@@ -0,0 +1,27 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; Make sure flat_scratch_init is set
|
||||
|
||||
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
|
||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
|
||||
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
|
||||
store volatile i32 0, i32* %cast
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: Could optimize out in this case
|
||||
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
|
||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
|
||||
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
store volatile i32 0, i32 addrspace(5)* %alloca
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
|
||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
|
||||
define amdgpu_kernel void @kernel_no_calls_no_stack() {
|
||||
ret void
|
||||
}
|
|
@@ -1,5 +1,7 @@
|
|||
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
|
||||
|
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
|
||||
|
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #0
|
||||
declare i32 @llvm.amdgcn.workgroup.id.y() #0
|
||||
declare i32 @llvm.amdgcn.workgroup.id.z() #0
|
||||
|
@@ -250,6 +252,31 @@ define amdgpu_kernel void @use_is_private(i8* %ptr) #1 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; HSA: define amdgpu_kernel void @use_alloca() #13 {
|
||||
define amdgpu_kernel void @use_alloca() #1 {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
store i32 0, i32 addrspace(5)* %alloca
|
||||
ret void
|
||||
}
|
||||
|
||||
; HSA: define amdgpu_kernel void @use_alloca_non_entry_block() #13 {
|
||||
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
|
||||
entry:
|
||||
br label %bb
|
||||
|
||||
bb:
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
store i32 0, i32 addrspace(5)* %alloca
|
||||
ret void
|
||||
}
|
||||
|
||||
; HSA: define void @use_alloca_func() #13 {
|
||||
define void @use_alloca_func() #1 {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
store i32 0, i32 addrspace(5)* %alloca
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
||||
attributes #1 = { nounwind }
|
||||
|
||||
|
@@ -266,3 +293,4 @@ attributes #1 = { nounwind }
|
|||
; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
|
||||
; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
|
||||
; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
|
||||
; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }
|
||||
|
|
Loading…
Reference in New Issue