[amdgpu] Add codegen support for HIP dynamic shared memory.
Summary: HIP uses an unsized extern array, `extern __shared__ T s[]`, to declare dynamic shared memory, whose size is not known at compile time.

Reviewers: arsenm, yaxunl, kpyzhov, b-sumner

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D82496
This commit is contained in:
parent e2ab5bcf56
commit 5257a60ee0
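For context, this is the source-level pattern the commit targets. The sketch below is illustrative only and is not part of the patch; the kernel, buffer, and size names are made up, and the third launch parameter is the standard HIP way to pass the dynamic shared memory size in bytes.

// dyn_shared.hip -- illustrative only; names are invented for this sketch.
#include <hip/hip_runtime.h>

// Unsized extern array: the dynamic LDS ("shared") allocation whose size is
// only known at launch time.
extern __shared__ float dyn_smem[];

__global__ void scale(float *out, float s, int n) {
  int i = threadIdx.x;
  if (i < n) {
    dyn_smem[i] = out[i] * s; // lands in LDS right after any static LDS
    __syncthreads();
    out[i] = dyn_smem[i];
  }
}

int main() {
  const int n = 256;
  float *buf;
  hipMalloc(&buf, n * sizeof(float));
  // The third launch parameter supplies the dynamic LDS size in bytes.
  scale<<<1, n, n * sizeof(float)>>>(buf, 2.0f, n);
  hipDeviceSynchronize();
  hipFree(buf);
}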
@@ -159,6 +159,22 @@ template <> struct ScalarTraits<MaybeAlign> {
   static QuotingType mustQuote(StringRef) { return QuotingType::None; }
 };
 
+template <> struct ScalarTraits<Align> {
+  static void output(const Align &Alignment, void *, llvm::raw_ostream &OS) {
+    OS << Alignment.value();
+  }
+  static StringRef input(StringRef Scalar, void *, Align &Alignment) {
+    unsigned long long N;
+    if (getAsUnsignedInteger(Scalar, 10, N))
+      return "invalid number";
+    if (!isPowerOf2_64(N))
+      return "must be a power of two";
+    Alignment = Align(N);
+    return StringRef();
+  }
+  static QuotingType mustQuote(StringRef) { return QuotingType::None; }
+};
+
 } // end namespace yaml
 } // end namespace llvm
 
@@ -2279,6 +2279,25 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
       return true; // Leave in place;
     }
 
+    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
+      Type *Ty = GV->getValueType();
+      // HIP uses an unsized array `extern __shared__ T s[]` or similar
+      // zero-sized type in other languages to declare the dynamic shared
+      // memory which size is not known at the compile time. They will be
+      // allocated by the runtime and placed directly after the static
+      // allocated ones. They all share the same offset.
+      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+        // Adjust alignment for that dynamic shared memory array.
+        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+        LLT S32 = LLT::scalar(32);
+        auto Sz =
+            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+        B.buildIntToPtr(DstReg, Sz);
+        MI.eraseFromParent();
+        return true;
+      }
+    }
+
     B.buildConstant(
         DstReg,
         MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
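The comment's note that the dynamic arrays "all share the same offset" mirrors the usual HIP/CUDA rule that every unsized `extern __shared__` declaration aliases the one block the runtime appends after static LDS. A hypothetical sketch of the manual partitioning this implies on the source side (names invented for illustration, not taken from the patch):

#include <hip/hip_runtime.h>

// Both logical arrays below live in the SAME dynamic LDS block, so the kernel
// carves the single runtime allocation into pieces by hand.
// Launch with shared-size = nFloats*sizeof(float) + blockDim.x*sizeof(int).
extern __shared__ unsigned char dyn_lds[];

__global__ void split_dyn_lds(float *out, int nFloats) {
  float *fpart = reinterpret_cast<float *>(dyn_lds);
  int *ipart = reinterpret_cast<int *>(dyn_lds + nFloats * sizeof(float));
  int i = threadIdx.x;
  fpart[i] = static_cast<float>(i);
  ipart[i] = i;
  __syncthreads();
  out[i] = fpart[i] + static_cast<float>(ipart[i]);
}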
@@ -49,10 +49,27 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   /// TODO: We should sort these to minimize wasted space due to alignment
   /// padding. Currently the padding is decided by the first encountered use
   /// during lowering.
-  unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
+  unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
 
   Entry.first->second = Offset;
-  LDSSize += DL.getTypeAllocSize(GV.getValueType());
+  StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+  // Update the LDS size considering the padding to align the dynamic shared
+  // memory.
+  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
 
   return Offset;
 }
+
+void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+                                           const GlobalVariable &GV) {
+  assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
+
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
+  if (Alignment <= DynLDSAlign)
+    return;
+
+  LDSSize = alignTo(StaticLDSSize, Alignment);
+  DynLDSAlign = Alignment;
+}
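To make the bookkeeping above concrete, here is a small stand-alone sketch of the arithmetic that allocateLDSGlobal and setDynLDSAlign perform, using a local alignTo helper rather than LLVM's and numbers borrowed from the tests added later in this patch; it is an illustration of the layout rule, not the production code path.

#include <cassert>
#include <cstdint>

// Stand-in for LLVM's alignTo, just to trace the arithmetic.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Mirrors allocateLDSGlobal: statically allocated LDS globals are laid out
  // back to back, each aligned to its own requirement.
  uint64_t StaticLDSSize = 0, LDSSize = 0, DynLDSAlign = 1;

  // e.g. a [67 x i8] global at align 1 -> offset 0, static size 67.
  uint64_t Offset = StaticLDSSize = alignTo(StaticLDSSize, 1);
  StaticLDSSize += 67;
  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
  assert(Offset == 0 && LDSSize == 67);

  // Mirrors setDynLDSAlign: a dynamic [0 x float] array (align 4) pads the
  // reported LDS size so the runtime-appended block starts aligned.
  DynLDSAlign = 4;
  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
  assert(LDSSize == 68); // 0x44, the offset checked in the tests below

  // A second dynamic [0 x double] array (align 8) only grows the alignment.
  DynLDSAlign = 8;
  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
  assert(LDSSize == 72); // 0x48
  return 0;
}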
@@ -9,9 +9,10 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
 
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Support/Alignment.h"
 
 namespace llvm {
 
@@ -29,6 +30,17 @@ protected:
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize = 0;
 
+  /// Number of bytes in the LDS allocated statically. This field is only used
+  /// in the instruction selector and not part of the machine function info.
+  unsigned StaticLDSSize = 0;
+
+  /// Align for dynamic shared memory if any. Dynamic shared memory is
+  /// allocated directly after the static one, i.e., LDSSize. Need to pad
+  /// LDSSize to ensure that dynamic one is aligned accordingly.
+  /// The maximal alignment is updated during IR translation or lowering
+  /// stages.
+  Align DynLDSAlign;
+
   // State of MODE register, assumed FP mode.
   AMDGPU::SIModeRegisterDefaults Mode;
 
@@ -78,6 +90,10 @@ public:
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+
+  Align getDynLDSAlign() const { return DynLDSAlign; }
+
+  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
 };
 
 }
@@ -5571,15 +5571,32 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                              SDValue Op,
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSD);
+  EVT PtrVT = Op.getValueType();
 
   const GlobalValue *GV = GSD->getGlobal();
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
-      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV->hasExternalLinkage()) {
+      Type *Ty = GV->getValueType();
+      // HIP uses an unsized array `extern __shared__ T s[]` or similar
+      // zero-sized type in other languages to declare the dynamic shared
+      // memory which size is not known at the compile time. They will be
+      // allocated by the runtime and placed directly after the static
+      // allocated ones. They all share the same offset.
+      if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+        assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
+        // Adjust alignment for that dynamic shared memory array.
+        MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+        return SDValue(
+            DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
+      }
+    }
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
-
-  SDLoc DL(GSD);
-  EVT PtrVT = Op.getValueType();
+  }
 
   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
@@ -537,23 +537,20 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
 }
 
 yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
-    const llvm::SIMachineFunctionInfo& MFI,
-    const TargetRegisterInfo &TRI)
-  : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
-    MaxKernArgAlign(MFI.getMaxKernArgAlign()),
-    LDSSize(MFI.getLDSSize()),
-    IsEntryFunction(MFI.isEntryFunction()),
-    NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
-    MemoryBound(MFI.isMemoryBound()),
-    WaveLimiter(MFI.needsWaveLimiter()),
-    HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
-    HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
-    HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
-    ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
-    FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
-    StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
-    ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
-    Mode(MFI.getMode()) {}
+    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI)
+    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
+      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
+      DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
+      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
+      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
+      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
+      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
+      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
+      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+}
 
 void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
   MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
@@ -564,6 +561,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
   ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
   MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
   LDSSize = YamlMFI.LDSSize;
+  DynLDSAlign = YamlMFI.DynLDSAlign;
   HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
   IsEntryFunction = YamlMFI.IsEntryFunction;
   NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
@@ -277,6 +277,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   uint64_t ExplicitKernArgSize = 0;
   unsigned MaxKernArgAlign = 0;
   unsigned LDSSize = 0;
+  Align DynLDSAlign;
   bool IsEntryFunction = false;
   bool NoSignedZerosFPMath = false;
   bool MemoryBound = false;
@@ -306,6 +307,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        UINT64_C(0));
     YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
     YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+    YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
     YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
     YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
     YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
@@ -0,0 +1,140 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+
+@lds0 = addrspace(3) global [512 x float] undef
+@lds1 = addrspace(3) global [256 x float] undef
+@lds2 = addrspace(3) global [4096 x float] undef
+@lds3 = addrspace(3) global [67 x i8] undef
+
+@dynamic_shared0 = external addrspace(3) global [0 x float]
+@dynamic_shared1 = external addrspace(3) global [0 x double]
+@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
+@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_0:
+; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_1:
+; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
+; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0xc00, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx.0 = add nsw i32 %tid.x, 64
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  br label %endif
+
+else:                                             ; preds = %entry
+  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  %val = phi float [ %val0, %if ], [ %val1, %else ]
+  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, float addrspace(3)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_2:
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x4000, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned on the type
+; specified.
+; CHECK-LABEL: {{^}}dynamic_shared_array_3:
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned on the
+; maximal one.
+; CHECK-LABEL: {{^}}dynamic_shared_array_4:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_5:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_6:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
+; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
+define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
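The magic offsets checked above follow from simple padding arithmetic: each kernel's statically used LDS is laid out first, then rounded up to the alignment of the dynamic array it references. A hypothetical self-check of those constants (assuming only the LDS globals a kernel actually uses count toward its static size, which is what the tests rely on):

#include <cstddef>

// Round up to a power-of-two alignment.
constexpr std::size_t alignTo(std::size_t v, std::size_t a) {
  return (v + a - 1) & ~(a - 1);
}

// dynamic_shared_array_0/1/2: the dynamic array starts right after the static
// float arrays, which are already 4-byte aligned.
static_assert(alignTo(512 * 4, 4) == 0x800, "lds0 only");
static_assert(alignTo((512 + 256) * 4, 4) == 0xc00, "lds0 + lds1");
static_assert(alignTo(4096 * 4, 4) == 0x4000, "lds2 only");

// dynamic_shared_array_3..6: 67 bytes of static LDS padded to the dynamic
// array's alignment (4 for float, 8 for double, 16 when given explicitly).
static_assert(alignTo(67, 4) == 0x44, "float-aligned");
static_assert(alignTo(67, 8) == 0x48, "double-aligned");
static_assert(alignTo(67, 16) == 0x50, "explicit align 16");

int main() { return 0; }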
@@ -0,0 +1,138 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+
+@lds0 = addrspace(3) global [512 x float] undef
+@lds1 = addrspace(3) global [256 x float] undef
+@lds2 = addrspace(3) global [4096 x float] undef
+@lds3 = addrspace(3) global [67 x i8] undef
+
+@dynamic_shared0 = external addrspace(3) global [0 x float]
+@dynamic_shared1 = external addrspace(3) global [0 x double]
+@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
+@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_0:
+; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_1:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx.0 = add nsw i32 %tid.x, 64
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  br label %endif
+
+else:                                             ; preds = %entry
+  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  %val = phi float [ %val0, %if ], [ %val1, %else ]
+  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, float addrspace(3)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dynamic_shared_array_2:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned on the type
+; specified.
+; CHECK-LABEL: {{^}}dynamic_shared_array_3:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; The offset to the dynamic shared memory array should be aligned on the
+; maximal one.
+; CHECK-LABEL: {{^}}dynamic_shared_array_4:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_5:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+; Honor the explicit alignment from the specified variable.
+; CHECK-LABEL: {{^}}dynamic_shared_array_6:
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
+; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
+define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %vidx = add i32 %tid.x, %idx
+  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %val1 = uitofp i8 %val0 to float
+  %val2 = uitofp i8 %val0 to double
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
+  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
@@ -0,0 +1,14 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+
+---
+# CHECK: error: YAML:8:16: must be a power of two
+
+name: dyn_lds_with_alignment
+machineFunctionInfo:
+  dynLDSAlign: 9
+
+body: |
+  bb.0:
+    S_ENDPGM 0
+
+...
@@ -8,6 +8,7 @@
 # FULL-NEXT: explicitKernArgSize: 128
 # FULL-NEXT: maxKernArgAlign: 64
 # FULL-NEXT: ldsSize: 2048
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: true
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: true
@@ -81,6 +82,7 @@ body: |
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: false
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -121,6 +123,7 @@ body: |
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: false
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -162,6 +165,7 @@ body: |
 # FULL-NEXT: explicitKernArgSize: 0
 # FULL-NEXT: maxKernArgAlign: 1
 # FULL-NEXT: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 1
 # FULL-NEXT: isEntryFunction: true
 # FULL-NEXT: noSignedZerosFPMath: false
 # FULL-NEXT: memoryBound: false
@@ -285,3 +289,20 @@ body: |
     S_ENDPGM 0
 
 ...
+
+---
+# ALL-LABEL: name: dyn_lds_with_alignment
+
+# FULL: ldsSize: 0
+# FULL-NEXT: dynLDSAlign: 8
+
+# SIMPLE: dynLDSAlign: 8
+name: dyn_lds_with_alignment
+machineFunctionInfo:
+  dynLDSAlign: 8
+
+body: |
+  bb.0:
+    S_ENDPGM 0
+
+...
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -46,6 +47,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -76,6 +78,7 @@ define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: false
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false
@@ -105,6 +108,7 @@ define void @function() {
 ; CHECK-NEXT: explicitKernArgSize: 0
 ; CHECK-NEXT: maxKernArgAlign: 1
 ; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: false
 ; CHECK-NEXT: noSignedZerosFPMath: true
 ; CHECK-NEXT: memoryBound: false