AMDGPU: Add support for spilling to user-SGPR-pointed buffers

Summary: This lets you select which sort of spilling you want: either using s[0:1] directly, or a 64-bit load from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000
parent e04da5dee2
commit 2f3f9855f0
@@ -100,6 +100,10 @@ def int_amdgcn_dispatch_id :
   GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
   Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
 
+def int_amdgcn_implicit_buffer_ptr :
+  GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction Intrinsics
 //===----------------------------------------------------------------------===//
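From the IR side the new intrinsic is called like any other AMDGCN intrinsic. A minimal sketch of frontend code that materializes the pointer; the helper name is hypothetical, and the headers and surrounding module setup are assumed:

// Sketch only: emits a call to llvm.amdgcn.implicit.buffer.ptr. The intrinsic
// is IntrNoMem, so the call is a pure value that the backend folds into the
// preloaded SGPR pair (see the SIISelLowering hunk further down).
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: returns an i8 addrspace(2)* to the implicit buffer.
static Value *emitImplicitBufferPtr(Module &M, IRBuilder<> &B) {
  Function *Decl =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_implicit_buffer_ptr);
  return B.CreateCall(Decl);
}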
@@ -140,7 +140,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
-  if (STM.isAmdCodeObjectV2()) {
+  if (STM.isAmdCodeObjectV2(*MF)) {
     getSIProgramInfo(KernelInfo, *MF);
     EmitAmdKernelCodeT(*MF, KernelInfo);
   }

@@ -149,7 +149,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
+  if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
     AMDGPUTargetStreamer *TS =
         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
     SmallString<128> SymbolName;

@@ -779,7 +779,7 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 
   // FIXME: Should use getKernArgSize
   header.kernarg_segment_byte_size =
-      STM.getKernArgSegmentSize(MFI->getABIArgOffset());
+      STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
@@ -299,8 +299,9 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
 
-unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
-  unsigned ImplicitBytes = getImplicitArgNumBytes();
+unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
+                                            unsigned ExplicitArgBytes) const {
+  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
   if (ImplicitBytes == 0)
     return ExplicitArgBytes;
 
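The hunk cuts off before the function's result. As a rough standalone sketch, the segment size appends the implicit argument area after the explicit arguments; the align-then-append combine is an assumption based on getAlignmentForImplicitArgPtr in the header change below, not something this hunk shows:

// Standalone sketch, not LLVM code.
static unsigned alignUp(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

static unsigned kernArgSegmentSize(unsigned ExplicitArgBytes,
                                   unsigned ImplicitBytes, // 16 Mesa kernel, 32 HSA+OpenCL, else 0
                                   unsigned ImplicitAlign) { // 8 on HSA, else 4
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;
  return alignUp(ExplicitArgBytes, ImplicitAlign) + ImplicitBytes;
}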
@@ -313,22 +313,31 @@ public:
     return EnableXNACK;
   }
 
-  bool isAmdCodeObjectV2() const {
-    return isAmdHsaOS() || isMesa3DOS();
+  bool isMesaKernel(const MachineFunction &MF) const {
+    return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
+  }
+
+  // Covers VS/PS/CS graphics shaders
+  bool isMesaGfxShader(const MachineFunction &MF) const {
+    return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
+  }
+
+  bool isAmdCodeObjectV2(const MachineFunction &MF) const {
+    return isAmdHsaOS() || isMesaKernel(MF);
   }
 
   /// \brief Returns the offset in bytes from the start of the input buffer
   /// of the first explicit kernel argument.
-  unsigned getExplicitKernelArgOffset() const {
-    return isAmdCodeObjectV2() ? 0 : 36;
+  unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
+    return isAmdCodeObjectV2(MF) ? 0 : 36;
   }
 
   unsigned getAlignmentForImplicitArgPtr() const {
     return isAmdHsaOS() ? 8 : 4;
   }
 
-  unsigned getImplicitArgNumBytes() const {
-    if (isMesa3DOS())
+  unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
+    if (isMesaKernel(MF))
       return 16;
     if (isAmdHsaOS() && isOpenCLEnv())
       return 32;
 
@@ -595,7 +604,7 @@ public:
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
-  unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
+  unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
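The three new predicates split functions by OS and calling convention. A standalone toy sketch, not LLVM code, of how they partition the cases; IsShader stands in for AMDGPU::isShader(CallingConv):

#include <cstdio>

enum class OS { AmdHsa, Mesa3D, Other };

static bool isMesaKernel(OS Os, bool IsShader) {
  return Os == OS::Mesa3D && !IsShader;
}
static bool isMesaGfxShader(OS Os, bool IsShader) {
  return Os == OS::Mesa3D && IsShader;
}
static bool isAmdCodeObjectV2(OS Os, bool IsShader) {
  return Os == OS::AmdHsa || isMesaKernel(Os, IsShader);
}

int main() {
  // A Mesa compute kernel takes the Code Object V2 path; a Mesa graphics
  // shader does not, and instead gets the private memory input pointer.
  std::printf("Mesa kernel -> COv2: %d\n", isAmdCodeObjectV2(OS::Mesa3D, false));
  std::printf("Mesa shader -> COv2: %d\n", isAmdCodeObjectV2(OS::Mesa3D, true));
  std::printf("Mesa shader -> gfx path: %d\n", isMesaGfxShader(OS::Mesa3D, true));
}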
@@ -1587,7 +1587,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
-    unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
+    unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
@@ -237,7 +237,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
 
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
     PreloadedPrivateBufferReg = TRI->getPreloadedValue(
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }

@@ -255,7 +255,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
-    assert(ST.isAmdCodeObjectV2());
+    assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }

@@ -280,6 +280,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
   bool CopyBuffer = ResourceRegUsed &&
     PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ST.isAmdCodeObjectV2(MF) &&
     ScratchRsrcReg != PreloadedPrivateBufferReg;
 
   // This needs to be careful of the copying order to avoid overwriting one of

@@ -303,24 +304,57 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
   }
 
-  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
-    assert(!ST.isAmdCodeObjectV2());
+  if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+    assert(!ST.isAmdCodeObjectV2(MF));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
-    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
     unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
     unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
 
     // Use relocations to get the pointer, and setup the other bits manually.
     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
-      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
-      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
-      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
-      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    if (MFI->hasPrivateMemoryInputPtr()) {
+      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+      if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
+
+        BuildMI(MBB, I, DL, Mov64, Rsrc01)
+          .addReg(PreloadedPrivateBufferReg)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      } else {
+        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+
+        PointerType *PtrTy =
+          PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+                           AMDGPUAS::CONSTANT_ADDRESS);
+        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+        auto MMO = MF.getMachineMemOperand(PtrInfo,
+                                           MachineMemOperand::MOLoad |
+                                           MachineMemOperand::MOInvariant |
+                                           MachineMemOperand::MODereferenceable,
+                                           0, 0);
+        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
+          .addReg(PreloadedPrivateBufferReg)
+          .addImm(0) // offset
+          .addImm(0) // glc
+          .addMemOperand(MMO)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      }
+    } else {
+      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    }
 
     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
       .addImm(Rsrc23 & 0xffffffff)
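This prologue hunk is the heart of the change. When the function has a private memory input pointer, the first two resource words come either straight from s[0:1] (compute: the descriptor base is passed directly in the SGPRs) or from a 64-bit scalar load through s[0:1] (graphics: the SGPRs hold a pointer to the descriptor); otherwise the old relocation path is kept. A standalone sketch, not LLVM code, of the decision:

enum class RsrcInit {
  RelocatedSymbols, // s_mov_b32 rsrc0/1, SCRATCH_RSRC_DWORD0/1 (relocations)
  CopySgpr01,       // s_mov_b64 rsrc[0:1], s[0:1]           (compute)
  LoadFromSgpr01,   // s_load_dwordx2 rsrc[0:1], s[0:1], 0x0 (graphics)
};

static RsrcInit pickRsrc01Init(bool HasPrivateMemoryInputPtr, bool IsCompute) {
  if (!HasPrivateMemoryInputPtr)
    return RsrcInit::RelocatedSymbols; // pre-existing path
  return IsCompute ? RsrcInit::CopySgpr01 : RsrcInit::LoadFromSgpr01;
}

In all three cases the high resource words rsrc2/rsrc3 are still composed manually from TII->getScratchRsrcWords23().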
@@ -891,7 +891,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!AMDGPU::isShader(CallConv)) {
     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
-    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+    assert(!Info->hasDispatchPtr() &&
            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&

@@ -899,6 +899,12 @@ SDValue SITargetLowering::LowerFormalArguments(
            !Info->hasWorkItemIDZ());
   }
 
+  if (Info->hasPrivateMemoryInputPtr()) {
+    unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
+    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(PrivateMemoryPtrReg);
+  }
+
   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
   if (Info->hasPrivateSegmentBuffer()) {
     unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);

@@ -956,7 +962,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     if (VA.isMemLoc()) {
       VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
-      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+      const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
                               VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.

@@ -1080,7 +1086,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     HasStackObjects = true;
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects) {
       // If we have stack objects, we unquestionably need the private buffer
       // resource. For the Code Object V2 ABI, this will be the first 4 user

@@ -2504,9 +2510,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   // TODO: Should this propagate fast-math-flags?
 
   switch (IntrinsicID) {
+  case Intrinsic::amdgcn_implicit_buffer_ptr: {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+  }
   case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
-    if (!Subtarget->isAmdCodeObjectV2()) {
+    if (!Subtarget->isAmdCodeObjectV2(MF)) {
       DiagnosticInfoUnsupported BadIntrin(
           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
           DL.getDebugLoc());
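The kernel-argument offset computation above restates simply: non-Code-Object-V2 targets keep thread-group and global-size data in the first 36 bytes of the input buffer, so explicit arguments start at offset 36 there. A minimal sketch with hypothetical names:

// Standalone sketch: where an explicit kernel argument lands in the buffer.
static unsigned kernArgOffset(bool IsCodeObjectV2, unsigned LocMemOffset) {
  // Mirrors getExplicitKernelArgOffset(MF) from the subtarget header above.
  const unsigned ExplicitKernelArgOffset = IsCodeObjectV2 ? 0 : 36;
  return ExplicitKernelArgOffset + LocMemOffset;
}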
@@ -77,7 +77,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PrivateSegmentWaveByteOffset(false),
     WorkItemIDX(false),
     WorkItemIDY(false),
-    WorkItemIDZ(false) {
+    WorkItemIDZ(false),
+    PrivateMemoryInputPtr(false) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function *F = MF.getFunction();
 

@@ -114,7 +115,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (HasStackObjects || MaySpill)
     PrivateSegmentWaveByteOffset = true;
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects || MaySpill)
       PrivateSegmentBuffer = true;
 

@@ -126,6 +127,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
     if (F->hasFnAttribute("amdgpu-dispatch-id"))
       DispatchID = true;
+  } else if (ST.isMesaGfxShader(MF)) {
+    if (HasStackObjects || MaySpill)
+      PrivateMemoryInputPtr = true;
   }
 
   // We don't need to worry about accessing spills with flat instructions.

@@ -182,6 +186,13 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
   return FlatScratchInitUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
+  PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return PrivateMemoryPtrUserSGPR;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
   MachineFunction *MF,
   unsigned FrameIndex,
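addPrivateMemoryPtr reserves the next user-SGPR pair and bumps the count by two, which is how the pointer lands in s[0:1] when it is the first input. A standalone toy sketch, not LLVM code, of the bookkeeping:

#include <cstdio>

struct UserSgprAllocator {
  unsigned NumUserSGPRs = 0;

  // Returns the index of the first register of a newly reserved 64-bit pair.
  unsigned addPtr64() {
    unsigned First = NumUserSGPRs; // sgprN
    NumUserSGPRs += 2;             // the pointer occupies sgprN and sgprN+1
    return First;
  }
};

int main() {
  UserSgprAllocator A;
  unsigned R = A.addPtr64();
  std::printf("pointer in s[%u:%u]\n", R, R + 1); // s[0:1] when allocated first
}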
@@ -87,6 +87,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned ScratchRSrcReg;
   unsigned ScratchWaveOffsetReg;
 
+  // Input registers for non-HSA ABI
+  unsigned PrivateMemoryPtrUserSGPR;
+
   // Input registers setup for the HSA ABI.
   // User SGPRs in allocation order.
   unsigned PrivateSegmentBufferUserSGPR;

@@ -166,6 +169,11 @@ private:
   bool WorkItemIDY : 1;
   bool WorkItemIDZ : 1;
 
+  // Private memory buffer
+  // Compute directly in sgpr[0:1]
+  // Other shaders indirect 64-bits at sgpr[0:1]
+  bool PrivateMemoryInputPtr : 1;
+
   MCPhysReg getNextUserSGPR() const {
     assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
     return AMDGPU::SGPR0 + NumUserSGPRs;

@@ -204,6 +212,7 @@ public:
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
   unsigned addDispatchID(const SIRegisterInfo &TRI);
   unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+  unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
 
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {

@@ -308,6 +317,10 @@ public:
     return WorkItemIDZ;
   }
 
+  bool hasPrivateMemoryInputPtr() const {
+    return PrivateMemoryInputPtr;
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }

@@ -344,6 +357,10 @@ public:
     return QueuePtrUserSGPR;
   }
 
+  unsigned getPrivateMemoryPtrUserSGPR() const {
+    return PrivateMemoryPtrUserSGPR;
+  }
+
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
   }
@@ -1108,10 +1108,12 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
     return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    assert(ST.isAmdCodeObjectV2() &&
-           "Non-CodeObjectV2 ABI currently uses relocations");
-    assert(MFI->hasPrivateSegmentBuffer());
-    return MFI->PrivateSegmentBufferUserSGPR;
+    if (ST.isAmdCodeObjectV2(MF)) {
+      assert(MFI->hasPrivateSegmentBuffer());
+      return MFI->PrivateSegmentBufferUserSGPR;
+    }
+    assert(MFI->hasPrivateMemoryInputPtr());
+    return MFI->PrivateMemoryPtrUserSGPR;
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
     assert(MFI->hasKernargSegmentPtr());
     return MFI->KernargSegmentPtrUserSGPR;