2021-07-19 21:55:39 +08:00
|
|
|
//===- AMDGPUResourceUsageAnalysis.cpp --- analysis of resources ----------===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief Analyzes how many registers and other resources are used by
|
|
|
|
/// functions.
|
|
|
|
///
|
|
|
|
/// The results of this analysis are used to fill the register usage, flat
|
|
|
|
/// usage, etc. into hardware registers.
|
|
|
|
///
|
|
|
|
/// The analysis takes callees into account. E.g. if a function A that needs 10
|
|
|
|
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
|
|
|
|
/// will return 20.
|
|
|
|
/// It is assumed that an indirect call can go into any function except
|
|
|
|
/// hardware-entrypoints. Therefore the register usage of functions with
|
|
|
|
/// indirect calls is estimated as the maximum of all non-entrypoint functions
|
|
|
|
/// in the module.
|
|
|
|
///
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPUResourceUsageAnalysis.h"
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "GCNSubtarget.h"
|
|
|
|
#include "SIMachineFunctionInfo.h"
|
|
|
|
#include "llvm/Analysis/CallGraph.h"
|
|
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
using namespace llvm::AMDGPU;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "amdgpu-resource-usage"
|
|
|
|
|
|
|
|
// Storage for the pass ID, plus the externally visible reference that aliases
// that same object.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;

char &llvm::AMDGPUResourceUsageAnalysisID =
    llvm::AMDGPUResourceUsageAnalysis::ID;
|
|
|
|
|
|
|
|
// We need to tell the runtime some amount ahead of time if we don't know the
|
|
|
|
// true stack size. Assume a smaller number if this is only due to dynamic /
|
|
|
|
// non-entry block allocas.
|
|
|
|
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
|
|
|
|
"amdgpu-assume-external-call-stack-size",
|
|
|
|
cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
|
|
|
|
cl::init(16384));
|
|
|
|
|
|
|
|
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
|
|
|
|
"amdgpu-assume-dynamic-stack-object-size",
|
|
|
|
cl::desc("Assumed extra stack use if there are any "
|
|
|
|
"variable sized objects (in bytes)"),
|
|
|
|
cl::Hidden, cl::init(4096));
|
|
|
|
|
|
|
|
// Register the pass under DEBUG_TYPE ("amdgpu-resource-usage").
// NOTE(review): the two trailing `true` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm against
// llvm/PassSupport.h.
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
|
|
|
|
|
|
|
|
/// Map the callee operand of a call instruction to the IR function it names.
///
/// \param Op the callee operand; either an immediate or a global.
/// \returns the referenced Function when \p Op is a global (it must be a
/// Function, otherwise the cast asserts), or nullptr when \p Op is an
/// immediate. An immediate operand is asserted to be 0.
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (!Op.isImm())
    return cast<Function>(Op.getGlobal());

  // An immediate callee carries no function; only the value 0 is expected.
  assert(Op.getImm() == 0);
  return nullptr;
}
|
|
|
|
|
|
|
|
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
|
|
|
|
const SIInstrInfo &TII, unsigned Reg) {
|
|
|
|
for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
|
|
|
|
if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
|
|
|
|
const GCNSubtarget &ST) const {
|
|
|
|
return NumExplicitSGPR +
|
|
|
|
IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
|
|
|
|
ST.getTargetID().isXnackOnOrAny());
|
|
|
|
}
|
|
|
|
|
2021-04-16 04:00:22 +08:00
|
|
|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
|
|
|
|
const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
|
|
|
|
if (ST.hasGFX90AInsts() && ArgNumAGPR)
|
|
|
|
return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
|
|
|
|
return std::max(ArgNumVGPR, ArgNumAGPR);
|
|
|
|
}
|
|
|
|
|
2021-07-19 21:55:39 +08:00
|
|
|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
|
|
|
|
const GCNSubtarget &ST) const {
|
2021-04-16 04:00:22 +08:00
|
|
|
return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
|
2021-07-19 21:55:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Analyze every defined function in \p SCC and record its resource usage in
/// CallGraphResourceInfo.
///
/// Does nothing when no TargetPassConfig is available. If any analyzed
/// function reports an indirect call, propagateIndirectCallRegisterUsage() is
/// run afterwards.
///
/// \returns always false.
bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *PassConfig = getAnalysisIfAvailable<TargetPassConfig>();
  if (PassConfig == nullptr)
    return false;

  const TargetMachine &TM = PassConfig->getTM<TargetMachine>();
  bool SawIndirectCall = false;

  for (CallGraphNode *Node : SCC) {
    Function *Fn = Node->getFunction();
    const bool IsDefined = Fn && !Fn->isDeclaration();
    if (!IsDefined)
      continue;

    MachineFunction &MF = getAnalysis<MachineModuleInfoWrapperPass>()
                              .getMMI()
                              .getOrCreateMachineFunction(*Fn);

    // Create the map entry before running the analysis: analyzeResourceUsage
    // looks callees up in CallGraphResourceInfo, so the entry for this
    // function already exists (default-constructed) while it is analyzed.
    auto Inserted = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Slot = Inserted.first->second;
    assert(Inserted.second && "should only be called once per function");

    Slot = analyzeResourceUsage(MF, TM);
    SawIndirectCall |= Slot.HasIndirectCall;
  }

  if (SawIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
|
|
|
|
|
|
|
|
/// Compute the resource usage of one machine function.
///
/// The result records flat-scratch and VCC usage, the private segment (stack)
/// size, and the number of SGPRs, VGPRs and AGPRs in use. For functions
/// without calls or tail calls the register counts come straight from
/// MachineRegisterInfo. Otherwise every register operand of every instruction
/// is inspected, and for each call the already-recorded usage of the callee
/// (from CallGraphResourceInfo) is merged in. Calls whose callee is unknown,
/// is only a declaration, or has no recorded info are treated as indirect:
/// they add AssumedStackSizeForExternalCall to the callee frame size and set
/// HasIndirectCall so register usage can be fixed up later.
///
/// \param MF the machine function to analyze.
/// \param TM the target machine (not consulted by this function).
/// \returns the populated SIFunctionResourceInfo.
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  const bool FlatScrReferenced =
      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
      MRI.isLiveIn(
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
  Info.UsesFlatScratch = FlatScrReferenced;

  // An implicit use of FLAT_SCRATCH has no effect unless flat instructions
  // actually access the scratch buffer (inline assembly may still need it).
  // So when flat scratch was not initialized and the only references are
  // implicit operands on flat instructions, do not count it as used.
  if (FlatScrReferenced && !MFI->hasFlatScratchInit()) {
    const bool OnlyImplicitFlatUses =
        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI);
    if (OnlyImplicitFlatUses)
      Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Objects of unknown size: add the configured estimate on top.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // Without calls, MachineRegisterInfo alone gives the used register counts.
  // Tail calls are checked separately because MachineFrameInfo does not count
  // them as calls.
  const bool HasAnyCall = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
  if (!HasAnyCall) {
    // Walk the class from its last register backwards; the first used one is
    // the highest. Hardware indices start at 0, so the count is index + 1.
    // Yields 0 when no register of the class is used.
    auto CountUpToHighestUsed = [&MRI, &TRI](const auto &RegClass) -> int32_t {
      for (MCPhysReg PhysReg : reverse(RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(PhysReg))
          return TRI.getHWRegIndex(PhysReg) + 1;
      }
      return 0;
    };

    // NumAGPR is only written on subtargets with MAI instructions.
    if (ST.hasMAIInsts())
      Info.NumAGPR = CountUpToHighestUsed(AMDGPU::AGPR_32RegClass);

    Info.NumVGPR = CountUpToHighestUsed(AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = CountUpToHighestUsed(AMDGPU::SGPR_32RegClass);

    return Info;
  }

  // Highest hardware register index seen so far per register file; -1 means
  // none.
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;

        const Register Reg = MO.getReg();

        // Special registers are either skipped, recorded as a flag, or
        // rejected outright; only "ordinary" registers fall through.
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        // Flat scratch usage was already decided above.
        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        // Classify the register: which running maximum it contributes to and
        // how many consecutive hardware registers it covers. The classes are
        // tested in a fixed order and the first match wins.
        int32_t *HighWater = nullptr;
        unsigned NumRegs = 0;

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          HighWater = &MaxSGPR;
          NumRegs = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          HighWater = &MaxSGPR;
          NumRegs = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          HighWater = &MaxSGPR;
          NumRegs = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          HighWater = &MaxSGPR;
          NumRegs = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          HighWater = &MaxSGPR;
          NumRegs = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          HighWater = &MaxSGPR;
          NumRegs = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          HighWater = &MaxSGPR;
          NumRegs = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          HighWater = &MaxSGPR;
          NumRegs = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          HighWater = &MaxSGPR;
          NumRegs = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          HighWater = &MaxSGPR;
          NumRegs = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          HighWater = &MaxVGPR;
          NumRegs = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          HighWater = &MaxAGPR;
          NumRegs = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }

        // Index of the last hardware register this operand covers.
        const int LastHWReg = TRI.getHWRegIndex(Reg) + NumRegs - 1;
        *HighWater = std::max<int32_t>(*HighWater, LastHWReg);
      }

      if (!MI.isCall())
        continue;

      // Pseudo used just to encode the underlying global. Is there a better
      // way to track this?
      const MachineOperand *CalleeOp =
          TII->getNamedOperand(MI, AMDGPU::OpName::callee);
      const Function *Callee = getCalleeFunction(*CalleeOp);

      // Avoid crashing on undefined behavior with an illegal call to a
      // kernel. A callsite whose calling convention does not match the
      // function's is undefined behavior; a matching one would already have
      // been rejected earlier.
      if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
        report_fatal_error("invalid call to entry function");

      // Only defined, known callees are looked up; everything else keeps the
      // end iterator and is handled as an indirect call.
      const bool IsIndirect = !Callee || Callee->isDeclaration();
      const auto CalleeIt = IsIndirect ? CallGraphResourceInfo.end()
                                       : CallGraphResourceInfo.find(Callee);

      if (CalleeIt == CallGraphResourceInfo.end()) {
        CalleeFrameSize =
            std::max(CalleeFrameSize,
                     static_cast<uint64_t>(AssumedStackSizeForExternalCall));

        // Register usage of indirect calls gets handled later.
        Info.UsesVCC = true;
        Info.UsesFlatScratch = ST.hasFlatAddressSpace();
        Info.HasDynamicallySizedStack = true;
        Info.HasIndirectCall = true;
      } else {
        // CodeGen is forced to run in SCC order, so the callee's recorded
        // usage should already be the cumulative usage of all its callees.
        const SIFunctionResourceInfo &CalleeInfo = CalleeIt->second;
        MaxSGPR = std::max(CalleeInfo.NumExplicitSGPR - 1, MaxSGPR);
        MaxVGPR = std::max(CalleeInfo.NumVGPR - 1, MaxVGPR);
        MaxAGPR = std::max(CalleeInfo.NumAGPR - 1, MaxAGPR);
        CalleeFrameSize =
            std::max(CalleeInfo.PrivateSegmentSize, CalleeFrameSize);
        Info.UsesVCC |= CalleeInfo.UsesVCC;
        Info.UsesFlatScratch |= CalleeInfo.UsesFlatScratch;
        Info.HasDynamicallySizedStack |= CalleeInfo.HasDynamicallySizedStack;
        Info.HasRecursion |= CalleeInfo.HasRecursion;
        Info.HasIndirectCall |= CalleeInfo.HasIndirectCall;
      }

      // FIXME: Call site could have norecurse on it
      if (!Callee || !Callee->doesNotRecurse())
        Info.HasRecursion = true;
    }
  }

  // Convert highest indices (-1 when unused) into counts.
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
|
|
|
|
|
|
|
|
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
|
|
|
|
// Collect the maximum number of registers from non-hardware-entrypoints.
|
|
|
|
// All these functions are potential targets for indirect calls.
|
|
|
|
int32_t NonKernelMaxSGPRs = 0;
|
|
|
|
int32_t NonKernelMaxVGPRs = 0;
|
|
|
|
int32_t NonKernelMaxAGPRs = 0;
|
|
|
|
|
|
|
|
for (const auto &I : CallGraphResourceInfo) {
|
|
|
|
if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
|
|
|
|
auto &Info = I.getSecond();
|
|
|
|
NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
|
|
|
|
NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
|
|
|
|
NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add register usage for functions with indirect calls.
|
|
|
|
// For calls to unknown functions, we assume the maximum register usage of
|
|
|
|
// all non-hardware-entrypoints in the current module.
|
|
|
|
for (auto &I : CallGraphResourceInfo) {
|
|
|
|
auto &Info = I.getSecond();
|
|
|
|
if (Info.HasIndirectCall) {
|
|
|
|
Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
|
|
|
|
Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
|
|
|
|
Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|