AMDGPU: Fix allocating GDS globals to LDS offsets

GDS globals don't seem to be well used or tested, but try to make their
allocation behavior a bit more consistent with LDS globals.

I'm not sure exactly what amdgpu-gds-size is supposed to mean. For now I
assume it reserves a static size at the beginning of the allocation, and
any known globals are allocated after it (see the sketch after the file
list below).
Matt Arsenault 2022-04-16 10:28:51 -04:00
parent 378bb8014d
commit b5ec131267
6 changed files with 173 additions and 24 deletions
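
A minimal sketch of that assumed model, for review purposes only (not code
from the commit): GDSAllocator and this free-standing alignTo are
hypothetical stand-ins for the logic in
AMDGPUMachineFunction::allocateLDSGlobal below.

#include <cstdint>

// Round Value up to the next multiple of Align (Align > 0).
static uint32_t alignTo(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Hypothetical illustration of the assumed GDS allocation model:
// the amdgpu-gds-size attribute reserves a static prefix, and each
// known GDS global is then appended at its required alignment.
struct GDSAllocator {
  uint32_t StaticGDSSize; // Seeded from the amdgpu-gds-size attribute.

  explicit GDSAllocator(uint32_t AttrSize) : StaticGDSSize(AttrSize) {}

  // Returns the byte offset assigned to a global of the given size and
  // alignment, mirroring the new else branch in allocateLDSGlobal.
  uint32_t allocate(uint32_t Size, uint32_t Align) {
    uint32_t Offset = StaticGDSSize = alignTo(StaticGDSSize, Align);
    StaticGDSSize += Size;
    return Offset;
  }
};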


@@ -32,6 +32,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
   Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
   WaveLimiter = WaveLimitAttr.getValueAsBool();
 
+  // FIXME: How is this attribute supposed to interact with statically known
+  // global sizes?
+  StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, GDSSize);
+
+  // Assume the attribute allocates before any known GDS globals.
+  StaticGDSSize = GDSSize;
+
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -46,18 +55,27 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
-  unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
+  unsigned Offset;
+  if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    /// TODO: We should sort these to minimize wasted space due to alignment
+    /// padding. Currently the padding is decided by the first encountered use
+    /// during lowering.
+    Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
+
+    StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+    // Update the LDS size considering the padding to align the dynamic shared
+    // memory.
+    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+  } else {
+    Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
+    StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+    // FIXME: Apply alignment of dynamic GDS
+    GDSSize = StaticGDSSize;
+  }
 
   Entry.first->second = Offset;
-  StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
-
-  // Update the LDS size considering the padding to align the dynamic shared
-  // memory.
-  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
-
   return Offset;
 }
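
Tracing the new else branch with the hypothetical GDSAllocator from the
sketch above: for the gds_global_align test added below (no
amdgpu-gds-size attribute, so the prefix starts at 0), the checked
offsets imply @gds_align32 is encountered first during lowering.

void gdsGlobalAlignExample() {
  GDSAllocator A(/*AttrSize=*/0);
  uint32_t Off32 = A.allocate(16, 32); // @gds_align32 -> base 0 (element 3 at 12)
  uint32_t Off8 = A.allocate(16, 8);   // @gds_align8 -> base 16 (element 3 at 28)
  // StaticGDSSize is now 32, the value the test loads into m0.
  (void)Off32;
  (void)Off8;
}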


@@ -12,6 +12,7 @@
 
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "AMDGPU.h"
 
 namespace llvm {
@@ -25,11 +26,13 @@ protected:
   Align MaxKernArgAlign; // Cache for this.
 
   /// Number of bytes in the LDS that are being used.
-  unsigned LDSSize = 0;
+  uint32_t LDSSize = 0;
+  uint32_t GDSSize = 0;
 
   /// Number of bytes in the LDS allocated statically. This field is only used
   /// in the instruction selector and not part of the machine function info.
-  unsigned StaticLDSSize = 0;
+  uint32_t StaticLDSSize = 0;
+  uint32_t StaticGDSSize = 0;
 
   /// Align for dynamic shared memory if any. Dynamic shared memory is
   /// allocated directly after the static one, i.e., LDSSize. Need to pad
@@ -69,6 +72,10 @@ public:
     return LDSSize;
   }
 
+  uint32_t getGDSSize() const {
+    return GDSSize;
+  }
+
   AMDGPU::SIModeRegisterDefaults getMode() const {
     return Mode;
   }


@@ -48,8 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
     GITPtrHigh(0xffffffff),
-    HighBitsOf32BitAddress(0),
-    GDSSize(0) {
+    HighBitsOf32BitAddress(0) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const Function &F = MF.getFunction();
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -184,10 +183,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (!S.empty())
     S.consumeInteger(0, HighBitsOf32BitAddress);
 
-  S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
-  if (!S.empty())
-    S.consumeInteger(0, GDSSize);
-
   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
   // VGPR available at all times.
   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {


@@ -437,7 +437,6 @@ private:
   unsigned GITPtrHigh;
   unsigned HighBitsOf32BitAddress;
-  unsigned GDSSize;
 
   // Current recorded maximum possible occupancy.
   unsigned Occupancy;
@@ -748,10 +747,6 @@ public:
     return HighBitsOf32BitAddress;
   }
 
-  unsigned getGDSSize() const {
-    return GDSSize;
-  }
-
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }


@@ -0,0 +1,132 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@gds0 = internal addrspace(2) global [4 x i32] undef, align 4
@lds0 = internal addrspace(3) global [4 x i32] undef, align 128
@lds1 = internal addrspace(3) global [4 x i32] undef, align 256
; These two objects should be allocated at the same constant offsets
; from the base.
define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
; GCN-LABEL: alloc_lds_gds:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 16
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_endpgm
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
%gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel
ret void
}
; The LDS alignment shouldn't change the GDS offset.
define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
; GCN-LABEL: alloc_lds_gds_align:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 16
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:140
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_endpgm
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
%gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel
%gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3
%val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel
ret void
}
@gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8
@gds_align32 = internal addrspace(2) global [4 x i32] undef, align 32
define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
; GCN-LABEL: gds_global_align:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, 32
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:28 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
ret void
}
define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 {
; GCN-LABEL: gds_global_align_plus_attr:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_movk_i32 m0, 0x420
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:1052 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:1036 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
ret void
}
@small.gds = internal addrspace(2) global i8 undef, align 1
@gds.external = external unnamed_addr addrspace(3) global [0 x i32], align 4
define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 {
; GCN-LABEL: gds_extern_align:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[0:1], 0x8
; GCN-NEXT: v_mov_b32_e32 v0, 5
; GCN-NEXT: s_movk_i32 m0, 0x401
; GCN-NEXT: s_movk_i32 s1, 0x400
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s1
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds)
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
ret void
}
attributes #0 = { "amdgpu-gds-size"="1024" }
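
Running the same hypothetical arithmetic for the two attribute tests
above: the "amdgpu-gds-size"="1024" prefix is reserved first and the
known globals land after it, which lines up with the m0 values and
ds_add_u32 offsets in the checks (again, the offsets imply @gds_align32
is allocated before @gds_align8).

void gdsAttrExamples() {
  GDSAllocator B(/*AttrSize=*/1024); // gds_global_align_plus_attr
  B.allocate(16, 32);                // @gds_align32 -> 1024 (element 3 at 1036)
  B.allocate(16, 8);                 // @gds_align8 -> 1040 (element 3 at 1052)
  // StaticGDSSize is now 1056 = 0x420, matching "s_movk_i32 m0, 0x420".

  GDSAllocator C(/*AttrSize=*/1024); // gds_extern_align
  C.allocate(1, 1);                  // @small.gds -> 1024 = 0x400, the "use s1" value
  // StaticGDSSize is now 1025 = 0x401, matching "s_movk_i32 m0, 0x401".
}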


@@ -53,7 +53,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 4
 ; CHECK-NEXT: ldsSize: 0
-; CHECK-NEXT: gdsSize: 0
+; CHECK-NEXT: gdsSize: 512
 ; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
@@ -80,6 +80,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: occupancy: 10
 ; CHECK-NEXT: body:
 
 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
+  %gep = getelementptr inbounds [128 x i32], [128 x i32] addrspace(2)* @gds, i32 0, i32 %arg0
+  atomicrmw add i32 addrspace(2)* %gep, i32 8 seq_cst
   ret void
 }
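
The gdsSize change above follows from the added atomicrmw: @ps_shader now
allocates @gds, which (judging by the [128 x i32] type in the
getelementptr; its declaration isn't shown in this hunk) is a 512-byte GDS
global, so the serialized gdsSize for the function becomes 512 instead of 0.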