forked from OSchip/llvm-project
AMDGPU: Refactor AsmPrinter
Avoid analyzing functions multiple times. This allows asserting that each function is only analyzed once. llvm-svn: 301938
This commit is contained in:
parent
7b82b4bddb
commit
b03dd8daae
|
@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
|
|||
return;
|
||||
|
||||
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
|
||||
SIProgramInfo KernelInfo;
|
||||
amd_kernel_code_t KernelCode;
|
||||
if (STM.isAmdCodeObjectV2(*MF)) {
|
||||
getSIProgramInfo(KernelInfo, *MF);
|
||||
getAmdKernelCode(KernelCode, KernelInfo, *MF);
|
||||
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
|
||||
|
||||
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
|
||||
getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
|
||||
|
@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
|
|||
AsmPrinter::EmitGlobalVariable(GV);
|
||||
}
|
||||
|
||||
/// Finalization hook for the module \p M.
///
/// Empties CallGraphResourceInfo (the per-function resource records filled in
/// by runOnMachineFunction) and then forwards to AsmPrinter::doFinalization,
/// returning that call's result unchanged.
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  // Drop the cached records before handing control to the base class.
  CallGraphResourceInfo.clear();

  const bool BaseResult = AsmPrinter::doFinalization(M);
  return BaseResult;
}
|
||||
|
||||
// Print comments that apply to both callable functions and entry points.
|
||||
void AMDGPUAsmPrinter::emitCommonFunctionComments(
|
||||
uint32_t NumVGPR,
|
||||
uint32_t NumSGPR,
|
||||
uint32_t ScratchSize,
|
||||
uint64_t CodeSize) {
|
||||
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
|
||||
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
|
||||
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
|
||||
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
|
||||
}
|
||||
|
||||
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
CurrentProgramInfo = SIProgramInfo();
|
||||
|
||||
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
|
||||
|
||||
// The starting address of all shader programs must be 256 bytes aligned.
|
||||
|
@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
|||
OutStreamer->SwitchSection(ConfigSection);
|
||||
}
|
||||
|
||||
SIProgramInfo KernelInfo;
|
||||
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
|
||||
getSIProgramInfo(KernelInfo, MF);
|
||||
if (MFI->isEntryFunction()) {
|
||||
getSIProgramInfo(CurrentProgramInfo, MF);
|
||||
} else {
|
||||
auto I = CallGraphResourceInfo.insert(
|
||||
std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
|
||||
SIFunctionResourceInfo &Info = I.first->second;
|
||||
assert(I.second && "should only be called once per function");
|
||||
Info = analyzeResourceUsage(MF);
|
||||
}
|
||||
|
||||
if (!STM.isAmdHsaOS()) {
|
||||
EmitProgramInfoSI(MF, KernelInfo);
|
||||
EmitProgramInfoSI(MF, CurrentProgramInfo);
|
||||
}
|
||||
} else {
|
||||
EmitProgramInfoR600(MF);
|
||||
|
@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
|||
OutStreamer->SwitchSection(CommentSection);
|
||||
|
||||
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
|
||||
if (MFI->isEntryFunction()) {
|
||||
OutStreamer->emitRawComment(" Kernel info:", false);
|
||||
} else {
|
||||
if (!MFI->isEntryFunction()) {
|
||||
OutStreamer->emitRawComment(" Function info:", false);
|
||||
SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
|
||||
emitCommonFunctionComments(
|
||||
Info.NumVGPR,
|
||||
Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
|
||||
Info.PrivateSegmentSize,
|
||||
getFunctionCodeSize(MF));
|
||||
return false;
|
||||
}
|
||||
|
||||
OutStreamer->emitRawComment(" Kernel info:", false);
|
||||
emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
|
||||
CurrentProgramInfo.NumSGPR,
|
||||
CurrentProgramInfo.ScratchSize,
|
||||
getFunctionCodeSize(MF));
|
||||
|
||||
OutStreamer->emitRawComment(" codeLenInByte = " +
|
||||
Twine(getFunctionCodeSize(MF)), false);
|
||||
OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
|
||||
false);
|
||||
OutStreamer->emitRawComment(
|
||||
" NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);
|
||||
|
||||
OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
|
||||
" bytes/workgroup (compile time only)", false);
|
||||
OutStreamer->emitRawComment(
|
||||
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
|
||||
" bytes/workgroup (compile time only)", false);
|
||||
|
||||
if (!MFI->isEntryFunction())
|
||||
return false;
|
||||
OutStreamer->emitRawComment(
|
||||
" SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
|
||||
|
||||
OutStreamer->emitRawComment(" SGPRBlocks: " +
|
||||
Twine(KernelInfo.SGPRBlocks), false);
|
||||
OutStreamer->emitRawComment(" VGPRBlocks: " +
|
||||
Twine(KernelInfo.VGPRBlocks), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" NumSGPRsForWavesPerEU: " +
|
||||
Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" NumVGPRsForWavesPerEU: " +
|
||||
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
|
||||
|
||||
OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
|
||||
Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
|
||||
OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
|
||||
Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
|
||||
|
||||
OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
|
||||
false);
|
||||
OutStreamer->emitRawComment(
|
||||
" ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
|
||||
false);
|
||||
OutStreamer->emitRawComment(
|
||||
" ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
|
||||
false);
|
||||
|
||||
if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
|
||||
OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
|
||||
Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
|
||||
OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
|
||||
Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
|
||||
Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" DebuggerPrivateSegmentBufferSGPR: s" +
|
||||
Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
|
||||
}
|
||||
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
|
||||
Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
|
||||
Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
|
||||
Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
|
||||
Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
|
||||
Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
|
||||
Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:USER_SGPR: " +
|
||||
Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
|
||||
Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
|
||||
Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
|
||||
Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
|
||||
Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
|
||||
OutStreamer->emitRawComment(
|
||||
" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
|
||||
Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
|
||||
false);
|
||||
} else {
|
||||
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
||||
OutStreamer->emitRawComment(
|
||||
|
@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
|
|||
return false;
|
||||
}
|
||||
|
||||
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
const MachineFunction &MF) const {
|
||||
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
const SIInstrInfo *TII = STM.getInstrInfo();
|
||||
const SIRegisterInfo *RI = &TII->getRegisterInfo();
|
||||
|
||||
|
||||
MCPhysReg NumVGPRReg = AMDGPU::NoRegister;
|
||||
for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
|
||||
if (MRI.isPhysRegUsed(Reg)) {
|
||||
NumVGPRReg = Reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
MCPhysReg NumSGPRReg = AMDGPU::NoRegister;
|
||||
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
|
||||
if (MRI.isPhysRegUsed(Reg)) {
|
||||
NumSGPRReg = Reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// We found the maximum register index. They start at 0, so add one to get the
|
||||
// number of registers.
|
||||
ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :
|
||||
RI->getHWRegIndex(NumVGPRReg) + 1;
|
||||
ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :
|
||||
RI->getHWRegIndex(NumSGPRReg) + 1;
|
||||
static unsigned getNumExtraSGPRs(const SISubtarget &ST,
|
||||
bool VCCUsed,
|
||||
bool FlatScrUsed) {
|
||||
unsigned ExtraSGPRs = 0;
|
||||
|
||||
ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
|
||||
MRI.isPhysRegUsed(AMDGPU::VCC_HI);
|
||||
if (ProgInfo.VCCUsed)
|
||||
if (VCCUsed)
|
||||
ExtraSGPRs = 2;
|
||||
|
||||
ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
|
||||
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
|
||||
|
||||
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
|
||||
// instructions aren't used to access the scratch buffer. Inline assembly
|
||||
// may need it though.
|
||||
//
|
||||
// If we only have implicit uses of flat_scr on flat instructions, it is not
|
||||
// really needed.
|
||||
if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&
|
||||
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
|
||||
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
|
||||
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
|
||||
ProgInfo.FlatUsed = false;
|
||||
}
|
||||
|
||||
if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
|
||||
if (ProgInfo.FlatUsed)
|
||||
if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
|
||||
if (FlatScrUsed)
|
||||
ExtraSGPRs = 4;
|
||||
} else {
|
||||
if (STM.isXNACKEnabled())
|
||||
if (ST.isXNACKEnabled())
|
||||
ExtraSGPRs = 4;
|
||||
|
||||
if (ProgInfo.FlatUsed)
|
||||
if (FlatScrUsed)
|
||||
ExtraSGPRs = 6;
|
||||
}
|
||||
|
||||
return ExtraSGPRs;
|
||||
}
|
||||
|
||||
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
|
||||
const SISubtarget &ST) const {
|
||||
return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
|
||||
}
|
||||
|
||||
/// Collect register and stack usage for \p MF into a SIFunctionResourceInfo.
///
/// Only functions whose frame info reports no calls are supported; for any
/// other function this reaches llvm_unreachable("calls not implemented").
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
  const MachineFunction &MF) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  SIFunctionResourceInfo Info;

  // FLAT_SCRATCH only counts as used when either half is marked used AND
  // something actually needs it: the function has a flat scratch init, or
  // there is a non-flat use of FLAT_SCR / FLAT_SCR_LO / FLAT_SCR_HI (inline
  // assembly may need it). Implicit uses on flat instructions alone have no
  // effect if flat instructions aren't used to access the scratch buffer, so
  // they do not count.
  const bool FlatScrTouched = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                              MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
  Info.UsesFlatScratch =
      FlatScrTouched &&
      (MFI->hasFlatScratchInit() ||
       hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) ||
       hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) ||
       hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI));

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Functions containing calls are not handled yet.
  if (FrameInfo.hasCalls())
    llvm_unreachable("calls not implemented");

  // With no calls, MachineRegisterInfo alone tells us which registers are
  // used.
  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // Walk each register class from the top down; the first used register is the
  // highest one. Hardware indices start at 0, so the register count is that
  // index plus one. A class with no used register contributes a count of 0.
  Info.NumVGPR = 0;
  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
    if (!MRI.isPhysRegUsed(Reg))
      continue;
    Info.NumVGPR = TRI.getHWRegIndex(Reg) + 1;
    break;
  }

  Info.NumExplicitSGPR = 0;
  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
    if (!MRI.isPhysRegUsed(Reg))
      continue;
    Info.NumExplicitSGPR = TRI.getHWRegIndex(Reg) + 1;
    break;
  }

  return Info;
}
|
||||
|
||||
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
const MachineFunction &MF) {
|
||||
SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
|
||||
|
||||
ProgInfo.NumVGPR = Info.NumVGPR;
|
||||
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
|
||||
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
|
||||
ProgInfo.VCCUsed = Info.UsesVCC;
|
||||
ProgInfo.FlatUsed = Info.UsesFlatScratch;
|
||||
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
|
||||
|
||||
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
const SIInstrInfo *TII = STM.getInstrInfo();
|
||||
const SIRegisterInfo *RI = &TII->getRegisterInfo();
|
||||
|
||||
unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
|
||||
ProgInfo.VCCUsed,
|
||||
ProgInfo.FlatUsed);
|
||||
unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
|
||||
|
||||
// Check the addressable register limit before we add ExtraSGPRs.
|
||||
|
@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
|||
// Make clamp modifier on NaN input returns 0.
|
||||
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
|
||||
|
||||
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
||||
ProgInfo.ScratchSize = FrameInfo.getStackSize();
|
||||
|
||||
unsigned LDSAlignShift;
|
||||
if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
|
||||
// LDS is allocated in 64 dword blocks.
|
||||
|
@ -646,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
|
|||
}
|
||||
|
||||
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
|
||||
const SIProgramInfo &KernelInfo) {
|
||||
const SIProgramInfo &CurrentProgramInfo) {
|
||||
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
|
||||
|
@ -654,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
|
|||
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
|
||||
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
|
||||
|
||||
OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
|
||||
OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
|
||||
|
||||
OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
|
||||
OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
|
||||
OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
|
||||
|
||||
OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
|
||||
OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
|
||||
OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
|
||||
|
||||
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
|
||||
// 0" comment but I don't see a corresponding field in the register spec.
|
||||
} else {
|
||||
OutStreamer->EmitIntValue(RsrcReg, 4);
|
||||
OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
|
||||
S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
|
||||
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
|
||||
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
|
||||
if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
|
||||
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
|
||||
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
|
||||
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
|
||||
}
|
||||
}
|
||||
|
||||
if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
|
||||
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
|
||||
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
|
||||
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
|
||||
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
|
||||
OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
|
||||
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
|
||||
|
@ -704,7 +787,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
|
|||
}
|
||||
|
||||
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
|
||||
const SIProgramInfo &KernelInfo,
|
||||
const SIProgramInfo &CurrentProgramInfo,
|
||||
const MachineFunction &MF) const {
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
|
||||
|
@ -712,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
|
|||
AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
|
||||
|
||||
Out.compute_pgm_resource_registers =
|
||||
KernelInfo.ComputePGMRSrc1 |
|
||||
(KernelInfo.ComputePGMRSrc2 << 32);
|
||||
CurrentProgramInfo.ComputePGMRSrc1 |
|
||||
(CurrentProgramInfo.ComputePGMRSrc2 << 32);
|
||||
Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
|
||||
|
||||
if (CurrentProgramInfo.DynamicCallStack)
|
||||
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
|
||||
|
||||
AMD_HSA_BITS_SET(Out.code_properties,
|
||||
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
|
||||
getElementByteSizeValue(STM.getMaxPrivateElementSize()));
|
||||
|
@ -767,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
|
|||
// FIXME: Should use getKernArgSize
|
||||
Out.kernarg_segment_byte_size =
|
||||
STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
|
||||
Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
|
||||
Out.workitem_vgpr_count = KernelInfo.NumVGPR;
|
||||
Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
|
||||
Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
|
||||
Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
|
||||
Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
|
||||
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
|
||||
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
|
||||
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
|
||||
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
|
||||
Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
|
||||
Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
|
||||
|
||||
// These alignment values are specified in powers of two, so alignment =
|
||||
// 2^n. The minimum alignment is 2^4 = 16.
|
||||
|
@ -781,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
|
|||
|
||||
if (STM.debuggerEmitPrologue()) {
|
||||
Out.debug_wavefront_private_segment_offset_sgpr =
|
||||
KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
|
||||
CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
|
||||
Out.debug_private_segment_buffer_sgpr =
|
||||
KernelInfo.DebuggerPrivateSegmentBufferSGPR;
|
||||
CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -30,9 +30,26 @@ namespace llvm {
|
|||
|
||||
class AMDGPUTargetStreamer;
|
||||
class MCOperand;
|
||||
class SISubtarget;
|
||||
|
||||
class AMDGPUAsmPrinter final : public AsmPrinter {
|
||||
private:
|
||||
// Track resource usage for callee functions.
// One record per function; analyzeResourceUsage fills it in and
// runOnMachineFunction stores it in CallGraphResourceInfo for non-entry
// functions.
|
||||
struct SIFunctionResourceInfo {
|
||||
// Track the number of explicitly used VGPRs. Special registers reserved at
|
||||
// the end are tracked separately.
|
||||
int32_t NumVGPR = 0;
|
||||
// Number of explicitly used SGPRs (highest used SGPR hardware index plus
// one). Extra SGPRs are not included here; getTotalNumSGPRs() adds them.
int32_t NumExplicitSGPR = 0;
|
||||
// Stack size of the function as reported by its MachineFrameInfo.
uint32_t PrivateSegmentSize = 0;
|
||||
// True when VCC_LO or VCC_HI is marked as a used physical register.
bool UsesVCC = false;
|
||||
// True when FLAT_SCR_LO/HI is used and that use is not merely an implicit
// use on flat instructions (see analyzeResourceUsage).
bool UsesFlatScratch = false;
|
||||
// True when the frame contains variable-sized objects.
bool HasDynamicallySizedStack = false;
|
||||
// NOTE(review): never assigned in the code visible here, so it keeps its
// default; it is read when computing SIProgramInfo::DynamicCallStack.
bool HasRecursion = false;
|
||||
|
||||
// Returns NumExplicitSGPR plus the extra SGPRs required on subtarget ST for
// the VCC / flat-scratch usage recorded above.
int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
|
||||
};
|
||||
|
||||
// Track resource usage for kernels / entry functions.
|
||||
struct SIProgramInfo {
|
||||
// Fields set in PGM_RSRC1 pm4 packet.
|
||||
uint32_t VGPRBlocks = 0;
|
||||
|
@ -83,14 +100,23 @@ private:
|
|||
uint16_t DebuggerPrivateSegmentBufferSGPR =
|
||||
std::numeric_limits<uint16_t>::max();
|
||||
|
||||
// Whether there is recursion, dynamic allocas, indirect calls or some other
|
||||
// reason there may be statically unknown stack usage.
|
||||
bool DynamicCallStack = false;
|
||||
|
||||
// Bonus information for debugging.
|
||||
bool VCCUsed = false;
|
||||
|
||||
SIProgramInfo() = default;
|
||||
};
|
||||
|
||||
SIProgramInfo CurrentProgramInfo;
|
||||
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
|
||||
|
||||
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
|
||||
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
|
||||
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
|
||||
|
||||
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
|
||||
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
|
||||
const MachineFunction &MF) const;
|
||||
void findNumUsedRegistersSI(const MachineFunction &MF,
|
||||
|
@ -101,6 +127,10 @@ private:
|
|||
/// can correctly setup the GPU state.
|
||||
void EmitProgramInfoR600(const MachineFunction &MF);
|
||||
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
|
||||
void emitCommonFunctionComments(uint32_t NumVGPR,
|
||||
uint32_t NumSGPR,
|
||||
uint32_t ScratchSize,
|
||||
uint64_t CodeSize);
|
||||
|
||||
public:
|
||||
explicit AMDGPUAsmPrinter(TargetMachine &TM,
|
||||
|
@ -112,6 +142,7 @@ public:
|
|||
|
||||
AMDGPUTargetStreamer& getTargetStreamer() const;
|
||||
|
||||
bool doFinalization(Module &M) override;
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
|
||||
/// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
|
||||
|
|
Loading…
Reference in New Issue