[amdgpu] Elide module lds allocation in kernels with no callees

Introduces a string attribute, amdgpu-elide-module-lds, to allow
eliding the module.lds block from kernels. Will allocate the block as before
if the attribute is missing; when it is present the allocation is skipped.

Patch uses the new attribute to detect the simplest possible instance of this,
where a kernel makes no calls and thus cannot call any functions that use LDS.

Tests updated to match, coverage was already good. An interesting case is in
lower-module-lds-offsets, where annotating the kernel allows the backend to pick
a different (in this case better) variable ordering than previously. A later
patch will avoid moving kernel variables into module.lds when the kernel can
have this attribute, allowing optimal ordering and locally unused variable
elimination.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D122091
This commit is contained in:
Jon Chesterfield 2022-05-04 22:42:05 +01:00
parent 411bb42eed
commit bc78c09952
9 changed files with 56 additions and 28 deletions

View File

@ -498,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
Info->allocateModuleLDSGlobal(F.getParent());
Info->allocateModuleLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@ -583,7 +583,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
Info->allocateModuleLDSGlobal(F.getParent());
Info->allocateModuleLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

View File

@ -30,6 +30,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
@ -163,9 +164,10 @@ public:
}
bool runOnModule(Module &M) override {
CallGraph CG = CallGraph(M);
UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);
Changed |= processUsedLDS(M);
Changed |= processUsedLDS(CG, M);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@ -174,7 +176,7 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
Changed |= processUsedLDS(M, &F);
Changed |= processUsedLDS(CG, M, &F);
}
UsedList.clear();
@ -226,7 +228,7 @@ private:
return Changed;
}
bool processUsedLDS(Module &M, Function *F = nullptr) {
bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@ -374,7 +376,20 @@ private:
IRBuilder<> Builder(Ctx);
for (Function &Func : M.functions()) {
if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
markUsedByKernel(Builder, &Func, SGV);
const CallGraphNode *N = CG[&Func];
const bool CalleesRequireModuleLDS = N->size() > 0;
if (CalleesRequireModuleLDS) {
// If a function this kernel might call requires module LDS,
// annotate the kernel to let later passes know it will allocate
// this structure, even if not apparent from the IR.
markUsedByKernel(Builder, &Func, SGV);
} else {
// However if we are certain this kernel cannot call a function that
// requires module LDS, annotate the kernel so the backend can elide
// the allocation without repeating callgraph walks.
Func.addFnAttr("amdgpu-elide-module-lds");
}
}
}
}

View File

@ -83,10 +83,16 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
// Returns true when this kernel is known not to call any function that uses
// the module lds struct, so the backend may skip allocating it. The
// LowerModuleLDS pass tags such kernels with this attribute.
static bool canElideModuleLDS(const Function &F) {
  const bool ElideRequested = F.hasFnAttribute("amdgpu-elide-module-lds");
  return ElideRequested;
}
void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
const Module *M = F.getParent();
if (isModuleEntryFunction()) {
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
if (GV) {
if (GV && !canElideModuleLDS(F)) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
(void)Offset;
assert(Offset == 0 &&

View File

@ -102,7 +102,7 @@ public:
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
void allocateModuleLDSGlobal(const Module *M);
void allocateModuleLDSGlobal(const Function &F);
Align getDynLDSAlign() const { return DynLDSAlign; }

View File

@ -2447,7 +2447,7 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
Info->allocateModuleLDSGlobal(Fn.getParent());
Info->allocateModuleLDSGlobal(Fn);
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;

View File

@ -21,9 +21,8 @@
; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
;.
define amdgpu_kernel void @k0() {
define amdgpu_kernel void @k0() #0 {
; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
@ -49,9 +48,8 @@ define amdgpu_kernel void @k0() {
ret void
}
define amdgpu_kernel void @k1() {
define amdgpu_kernel void @k1() #0 {
; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
@ -72,9 +70,8 @@ define amdgpu_kernel void @k1() {
ret void
}
define amdgpu_kernel void @0() {
define amdgpu_kernel void @0() #0 {
; CHECK-LABEL: @0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: ret void
@ -85,9 +82,8 @@ define amdgpu_kernel void @0() {
ret void
}
define amdgpu_kernel void @1() {
define amdgpu_kernel void @1() #0 {
; CHECK-LABEL: @1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: ret void
@ -114,6 +110,6 @@ define void @f0() {
ret void
}
;.
; CHECK: attributes #0 = { nocallback nofree nosync nounwind readnone willreturn }
;.
attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

View File

@ -35,8 +35,8 @@ entry:
ret void
}
; CHECK-LABEL: @timestwo()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-LABEL: @timestwo() #0
; CHECK-NOT: call void @llvm.donothing()
; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
; CHECK: %3 = ptrtoint i32* %2 to i64
@ -56,3 +56,6 @@ define amdgpu_kernel void @timestwo() {
store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
ret void
}
attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

View File

@ -4,7 +4,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Check that module LDS is allocated at address 0 and kernel starts its
; allocation past module LDS.
; allocation past module LDS when a call is present.
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
@ -22,12 +22,14 @@ define amdgpu_kernel void @k0() {
; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
; OPT-NEXT: call void @f0()
; OPT-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
call void @f0()
ret void
}
@ -36,7 +38,7 @@ define amdgpu_kernel void @k0() {
; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
; GCN: ds_write_b8 [[NULL]], [[TREE]]
define void @f0() {
; OPT-LABEL: @f0(
; OPT-LABEL: @f0() {
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: ret void
@ -45,3 +47,6 @@ define void @f0() {
store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
ret void
}
attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

View File

@ -48,13 +48,16 @@ define amdgpu_kernel void @kern_call() {
ret void
}
; This kernel does not need to alloc the LDS block as it makes no calls
; This kernel does not alloc the LDS block as it makes no calls
; CHECK-LABEL: @kern_empty()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
define spir_kernel void @kern_empty() {
; CHECK-NOT: call void @llvm.donothing()
define spir_kernel void @kern_empty() #0{
ret void
}
; Make sure we don't crash trying to insert code into a kernel
; declaration.
declare amdgpu_kernel void @kernel_declaration()
attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }