forked from OSchip/llvm-project
AMDGPU: Fix allocating GDS globals to LDS offsets
These don't seem to be very well used or tested, but try to make the behavior a bit more consistent with LDS globals. I'm not sure what the definition for amdgpu-gds-size is supposed to mean. For now I assumed it's allocating a static size at the beginning of the allocation, and any known globals are allocated after it.
This commit is contained in:
parent
378bb8014d
commit
b5ec131267
|
@ -32,6 +32,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
|
|||
Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
|
||||
WaveLimiter = WaveLimitAttr.getValueAsBool();
|
||||
|
||||
// FIXME: How is this attribute supposed to interact with statically known
|
||||
// global sizes?
|
||||
StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
|
||||
if (!S.empty())
|
||||
S.consumeInteger(0, GDSSize);
|
||||
|
||||
// Assume the attribute allocates before any known GDS globals.
|
||||
StaticGDSSize = GDSSize;
|
||||
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
|
||||
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
|
||||
|
@ -46,18 +55,27 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
|
|||
Align Alignment =
|
||||
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
|
||||
|
||||
unsigned Offset;
|
||||
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
|
||||
/// TODO: We should sort these to minimize wasted space due to alignment
|
||||
/// padding. Currently the padding is decided by the first encountered use
|
||||
/// during lowering.
|
||||
unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
|
||||
Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
|
||||
|
||||
Entry.first->second = Offset;
|
||||
StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
|
||||
|
||||
// Update the LDS size considering the padding to align the dynamic shared
|
||||
// memory.
|
||||
LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
|
||||
} else {
|
||||
Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
|
||||
StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
|
||||
|
||||
// FIXME: Apply alignment of dynamic GDS
|
||||
GDSSize = StaticGDSSize;
|
||||
}
|
||||
|
||||
Entry.first->second = Offset;
|
||||
return Offset;
|
||||
}
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "Utils/AMDGPUBaseInfo.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "AMDGPU.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
|
@ -25,11 +26,13 @@ protected:
|
|||
Align MaxKernArgAlign; // Cache for this.
|
||||
|
||||
/// Number of bytes in the LDS that are being used.
|
||||
unsigned LDSSize = 0;
|
||||
uint32_t LDSSize = 0;
|
||||
uint32_t GDSSize = 0;
|
||||
|
||||
/// Number of bytes in the LDS allocated statically. This field is only used
|
||||
/// in the instruction selector and not part of the machine function info.
|
||||
unsigned StaticLDSSize = 0;
|
||||
uint32_t StaticLDSSize = 0;
|
||||
uint32_t StaticGDSSize = 0;
|
||||
|
||||
/// Align for dynamic shared memory if any. Dynamic shared memory is
|
||||
/// allocated directly after the static one, i.e., LDSSize. Need to pad
|
||||
|
@ -69,6 +72,10 @@ public:
|
|||
return LDSSize;
|
||||
}
|
||||
|
||||
uint32_t getGDSSize() const {
|
||||
return GDSSize;
|
||||
}
|
||||
|
||||
AMDGPU::SIModeRegisterDefaults getMode() const {
|
||||
return Mode;
|
||||
}
|
||||
|
|
|
@ -48,8 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||
ImplicitBufferPtr(false),
|
||||
ImplicitArgPtr(false),
|
||||
GITPtrHigh(0xffffffff),
|
||||
HighBitsOf32BitAddress(0),
|
||||
GDSSize(0) {
|
||||
HighBitsOf32BitAddress(0) {
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const Function &F = MF.getFunction();
|
||||
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
|
||||
|
@ -184,10 +183,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||
if (!S.empty())
|
||||
S.consumeInteger(0, HighBitsOf32BitAddress);
|
||||
|
||||
S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
|
||||
if (!S.empty())
|
||||
S.consumeInteger(0, GDSSize);
|
||||
|
||||
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
|
||||
// VGPR available at all times.
|
||||
if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
|
||||
|
|
|
@ -437,7 +437,6 @@ private:
|
|||
unsigned GITPtrHigh;
|
||||
|
||||
unsigned HighBitsOf32BitAddress;
|
||||
unsigned GDSSize;
|
||||
|
||||
// Current recorded maximum possible occupancy.
|
||||
unsigned Occupancy;
|
||||
|
@ -748,10 +747,6 @@ public:
|
|||
return HighBitsOf32BitAddress;
|
||||
}
|
||||
|
||||
unsigned getGDSSize() const {
|
||||
return GDSSize;
|
||||
}
|
||||
|
||||
unsigned getNumUserSGPRs() const {
|
||||
return NumUserSGPRs;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
@gds0 = internal addrspace(2) global [4 x i32] undef, align 4
|
||||
@lds0 = internal addrspace(3) global [4 x i32] undef, align 128
|
||||
@lds1 = internal addrspace(3) global [4 x i32] undef, align 256
|
||||
|
||||
; These two objects should be allocated at the same constant offsets
|
||||
; from the base.
|
||||
define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
|
||||
; GCN-LABEL: alloc_lds_gds:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_mov_b32 m0, 16
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
|
||||
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
|
||||
%gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
|
||||
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel
|
||||
ret void
|
||||
}
|
||||
|
||||
; The LDS alignment shouldn't change offset of GDS.
|
||||
define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
|
||||
; GCN-LABEL: alloc_lds_gds_align:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_mov_b32 m0, 16
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:140
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
|
||||
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
|
||||
|
||||
%gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
|
||||
%val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel
|
||||
|
||||
%gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3
|
||||
%val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel
|
||||
ret void
|
||||
}
|
||||
|
||||
@gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8
|
||||
@gds_align32 = internal addrspace(2) global [4 x i32] undef, align 32
|
||||
|
||||
define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
|
||||
; GCN-LABEL: gds_global_align:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_mov_b32 m0, 32
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:28 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_endpgm
|
||||
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
|
||||
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
|
||||
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
|
||||
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 {
|
||||
; GCN-LABEL: gds_global_align_plus_attr:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_movk_i32 m0, 0x420
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:1052 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:1036 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_endpgm
|
||||
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
|
||||
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
|
||||
%gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
|
||||
%val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
|
||||
ret void
|
||||
}
|
||||
|
||||
@small.gds = internal addrspace(2) global i8 undef, align 1
|
||||
@gds.external = external unnamed_addr addrspace(3) global [0 x i32], align 4
|
||||
|
||||
define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 {
|
||||
; GCN-LABEL: gds_extern_align:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s0, s[0:1], 0x8
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 5
|
||||
; GCN-NEXT: s_movk_i32 m0, 0x401
|
||||
; GCN-NEXT: s_movk_i32 s1, 0x400
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ; use s1
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1
|
||||
; GCN-NEXT: s_endpgm
|
||||
call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds)
|
||||
%gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3
|
||||
%val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-gds-size"="1024" }
|
|
@ -53,7 +53,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
|||
; CHECK-NEXT: explicitKernArgSize: 0
|
||||
; CHECK-NEXT: maxKernArgAlign: 4
|
||||
; CHECK-NEXT: ldsSize: 0
|
||||
; CHECK-NEXT: gdsSize: 0
|
||||
; CHECK-NEXT: gdsSize: 512
|
||||
; CHECK-NEXT: dynLDSAlign: 1
|
||||
; CHECK-NEXT: isEntryFunction: true
|
||||
; CHECK-NEXT: noSignedZerosFPMath: false
|
||||
|
@ -80,6 +80,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
|||
; CHECK-NEXT: occupancy: 10
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
|
||||
%gep = getelementptr inbounds [128 x i32], [128 x i32] addrspace(2)* @gds, i32 0, i32 %arg0
|
||||
atomicrmw add i32 addrspace(2)* %gep, i32 8 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue