2012-12-12 05:25:42 +08:00
|
|
|
//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
|
|
|
|
#include "SIMachineFunctionInfo.h"
|
2014-09-24 09:33:17 +08:00
|
|
|
#include "AMDGPUSubtarget.h"
|
2014-05-02 23:41:42 +08:00
|
|
|
#include "SIInstrInfo.h"
|
2014-09-24 09:33:17 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2014-08-22 04:40:54 +08:00
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
2013-11-28 05:23:35 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2014-05-02 23:41:42 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2013-11-28 05:23:35 +08:00
|
|
|
|
|
|
|
// NOTE(review): MAX_LANES is not referenced in the visible part of this file;
// getSpilledReg() spells out 64 directly. Presumably this is the number of
// lanes available per VGPR for SGPR spilling -- confirm before relying on it
// or removing it.
#define MAX_LANES 64
|
2012-12-12 05:25:42 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
2013-11-19 08:57:56 +08:00
|
|
|
|
|
|
|
// Pin the vtable to this file.
|
|
|
|
// Deliberately empty: this out-of-line definition exists only so that the
// method is defined in this translation unit; it performs no work when called.
void SIMachineFunctionInfo::anchor() {}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
/// Build the SI-specific per-function state for \p MF.
///
/// Every register member starts out as AMDGPU::NoRegister, every counter as
/// zero and every "input requested" flag as false (ReturnsVoid starts true).
/// The body then switches on the inputs this function needs, based on its
/// calling convention, its function attributes, whether the frame already has
/// stack objects, and what the subtarget reports.
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  : AMDGPUMachineFunction(MF),
    TIDReg(AMDGPU::NoRegister),
    ScratchRSrcReg(AMDGPU::NoRegister),
    ScratchWaveOffsetReg(AMDGPU::NoRegister),
    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
    DispatchPtrUserSGPR(AMDGPU::NoRegister),
    QueuePtrUserSGPR(AMDGPU::NoRegister),
    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
    DispatchIDUserSGPR(AMDGPU::NoRegister),
    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
    PSInputAddr(0),
    ReturnsVoid(true),
    MaximumWorkGroupSize(0),
    LDSWaveSpillSize(0),
    PSInputEna(0),
    NumUserSGPRs(0),
    NumSystemSGPRs(0),
    HasSpilledSGPRs(false),
    HasSpilledVGPRs(false),
    HasNonSpillStackObjects(false),
    HasFlatInstructions(false),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
    DispatchID(false),
    KernargSegmentPtr(false),
    FlatScratchInit(false),
    GridWorkgroupCountX(false),
    GridWorkgroupCountY(false),
    GridWorkgroupCountZ(false),
    WorkGroupIDX(false),
    WorkGroupIDY(false),
    WorkGroupIDZ(false),
    WorkGroupInfo(false),
    PrivateSegmentWaveByteOffset(false),
    WorkItemIDX(false),
    WorkItemIDY(false),
    WorkItemIDZ(false) {
  const AMDGPUSubtarget &Subtarget = MF.getSubtarget<AMDGPUSubtarget>();
  const Function &Fn = *MF.getFunction();

  PSInputAddr = AMDGPU::getInitialPSInputAddr(Fn);

  const MachineFrameInfo *MFI = MF.getFrameInfo();

  // Anything that is not a shader calling convention always receives the
  // kernarg segment pointer and the X components of both IDs.
  if (!AMDGPU::isShader(Fn.getCallingConv())) {
    KernargSegmentPtr = true;
    WorkGroupIDX = true;
    WorkItemIDX = true;
  }

  // The remaining ID components are opt-in through function attributes.
  if (Fn.hasFnAttribute("amdgpu-work-group-id-y"))
    WorkGroupIDY = true;

  if (Fn.hasFnAttribute("amdgpu-work-group-id-z"))
    WorkGroupIDZ = true;

  const bool WantsWorkItemIDZ = Fn.hasFnAttribute("amdgpu-work-item-id-z");
  if (WantsWorkItemIDZ)
    WorkItemIDZ = true;

  // X, XY, and XYZ are the only supported work-item ID combinations, so a
  // request for Z also enables Y.
  if (WantsWorkItemIDZ || Fn.hasFnAttribute("amdgpu-work-item-id-y"))
    WorkItemIDY = true;

  const bool SpillingEnabled = Subtarget.isVGPRSpillingEnabled(Fn);
  const bool HasStackObjects = MFI->hasStackObjects();
  const bool NeedsScratch = HasStackObjects || SpillingEnabled;
  const bool IsHSA = Subtarget.isAmdHsaOS();

  if (NeedsScratch)
    PrivateSegmentWaveByteOffset = true;

  if (IsHSA) {
    if (NeedsScratch)
      PrivateSegmentBuffer = true;

    if (Fn.hasFnAttribute("amdgpu-dispatch-ptr"))
      DispatchPtr = true;

    // We don't need to worry about accessing spills with flat instructions,
    // which is why only pre-existing stack objects are considered here.
    // TODO: On VI where we must use flat for global, we should be able to omit
    // this if it is never used for generic access.
    if (HasStackObjects &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
      FlatScratchInit = true;
  }

  // Compute calling conventions take the limit from the function itself;
  // everything else is limited to a single wavefront.
  MaximumWorkGroupSize = AMDGPU::isCompute(Fn.getCallingConv())
                             ? AMDGPU::getMaximumWorkGroupSize(Fn)
                             : Subtarget.getWavefrontSize();
}
|
|
|
|
|
|
|
|
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
|
|
|
|
const SIRegisterInfo &TRI) {
|
|
|
|
PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
|
|
|
|
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
|
|
|
|
NumUserSGPRs += 4;
|
|
|
|
return PrivateSegmentBufferUserSGPR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Assign the dispatch pointer to the next free user SGPRs.
///
/// Looks up the SReg_64 super-register whose sub0 is the next user SGPR,
/// stores it in DispatchPtrUserSGPR and advances NumUserSGPRs by 2.
///
/// \param TRI register info used for the super-register lookup.
/// \returns the register that was recorded.
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(FirstSGPR, AMDGPU::sub0,
                                                &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return DispatchPtrUserSGPR;
}
|
|
|
|
|
|
|
|
/// Assign the queue pointer to the next free user SGPRs.
///
/// Looks up the SReg_64 super-register whose sub0 is the next user SGPR,
/// stores it in QueuePtrUserSGPR and advances NumUserSGPRs by 2.
///
/// \param TRI register info used for the super-register lookup.
/// \returns the register that was recorded.
unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  QueuePtrUserSGPR = TRI.getMatchingSuperReg(FirstSGPR, AMDGPU::sub0,
                                             &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return QueuePtrUserSGPR;
}
|
2014-08-22 04:40:54 +08:00
|
|
|
|
2015-12-01 05:16:03 +08:00
|
|
|
/// Assign the kernarg segment pointer to the next free user SGPRs.
///
/// Looks up the SReg_64 super-register whose sub0 is the next user SGPR,
/// stores it in KernargSegmentPtrUserSGPR and advances NumUserSGPRs by 2.
///
/// \param TRI register info used for the super-register lookup.
/// \returns the register that was recorded.
unsigned
SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return KernargSegmentPtrUserSGPR;
}
|
|
|
|
|
2016-02-12 14:31:30 +08:00
|
|
|
/// Assign the flat scratch init value to the next free user SGPRs.
///
/// Looks up the SReg_64 super-register whose sub0 is the next user SGPR,
/// stores it in FlatScratchInitUserSGPR and advances NumUserSGPRs by 2.
///
/// \param TRI register info used for the super-register lookup.
/// \returns the register that was recorded.
unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return FlatScratchInitUserSGPR;
}
|
|
|
|
|
2014-08-22 04:40:54 +08:00
|
|
|
/// Work out which VGPR and which lane of it hold one spilled 4-byte value.
///
/// The byte position is the frame offset of \p FrameIndex plus 4 * \p SubIdx.
/// Each VGPR covers 64 lanes of 4 bytes, so the position is split into a VGPR
/// index (position / 256) and a lane ((position / 4) % 64). The first time a
/// VGPR index is seen, an unused VGPR_32 register is looked up, remembered in
/// LaneVGPRs and added as a live-in to every basic block of \p MF.
///
/// \param MF         function whose frame info, register info and blocks are
///                   consulted and updated.
/// \param FrameIndex frame object whose offset selects the spill slot.
/// \param SubIdx     index of the 4-byte element within that object.
/// \returns a SpilledReg with Lane always filled in. If no unused VGPR could
///          be found, the result is returned early with its VGPR field left
///          as default-constructed.
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
    MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx) {
  // Geometry of the spill area: 4-byte slots, 64 of them per VGPR.
  const int BytesPerLane = 4;
  const int LanesPerVGPR = 64;

  MachineFrameInfo *MFI = MF->getFrameInfo();
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
      MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const int64_t ByteOffset =
      MFI->getObjectOffset(FrameIndex) + SubIdx * BytesPerLane;

  const unsigned VGPRSlot = ByteOffset / (LanesPerVGPR * BytesPerLane);
  const unsigned LaneInVGPR = (ByteOffset / BytesPerLane) % LanesPerVGPR;

  SpilledReg Result;
  Result.Lane = LaneInVGPR;

  if (LaneVGPRs.count(VGPRSlot) == 0) {
    const unsigned NewVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);

    // We have no VGPRs left for spilling SGPRs.
    if (NewVGPR == AMDGPU::NoRegister)
      return Result;

    LaneVGPRs[VGPRSlot] = NewVGPR;

    // Add this register as live-in to all blocks to avoid machine verifier
    // complaining about use of an undefined physical register.
    for (auto &MBB : *MF)
      MBB.addLiveIn(NewVGPR);
  }

  Result.VGPR = LaneVGPRs[VGPRSlot];
  return Result;
}
|
2014-09-24 09:33:17 +08:00
|
|
|
|
|
|
|
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
|
|
|
|
const MachineFunction &MF) const {
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-15 00:27:07 +08:00
|
|
|
return MaximumWorkGroupSize;
|
2014-09-24 09:33:17 +08:00
|
|
|
}
|