forked from OSchip/llvm-project
AMDGPU: Make SIInsertWaits about a factor of 4 faster
This was the slowest target custom pass and was spending 80% of its time in getMinimalPhysRegClass, which was called for every register operand. Try to use the statically known register class from the instruction's MCOperandInfo when possible. A few pseudo instructions are not well behaved and have unknown register classes, so they still require the expensive physical register class search. There are a few other possibilities for making this even faster, such as not inspecting implicit operands. For now those are still checked because it is technically possible to have a scalar load into exec or vcc, which can be implicitly used. llvm-svn: 249079
This commit is contained in:
parent
fc64fae6e3
commit
d1d499aa56
|
@ -91,7 +91,8 @@ private:
|
|||
bool isOpRelevant(MachineOperand &Op);
|
||||
|
||||
/// \brief Get register interval an operand affects.
|
||||
RegInterval getRegInterval(MachineOperand &Op);
|
||||
RegInterval getRegInterval(const TargetRegisterClass *RC,
|
||||
const MachineOperand &Reg) const;
|
||||
|
||||
/// \brief Handle an instruction's async components
|
||||
void pushInstruction(MachineBasicBlock &MBB,
|
||||
|
@ -142,8 +143,7 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
|
|||
}
|
||||
|
||||
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
|
||||
|
||||
uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
|
||||
uint64_t TSFlags = MI.getDesc().TSFlags;
|
||||
Counters Result = { { 0, 0, 0 } };
|
||||
|
||||
Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
|
||||
|
@ -161,10 +161,9 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
|
|||
MachineOperand &Op = MI.getOperand(0);
|
||||
assert(Op.isReg() && "First LGKM operand must be a register!");
|
||||
|
||||
unsigned Reg = Op.getReg();
|
||||
|
||||
// XXX - What if this is a write into a super register?
|
||||
unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
|
||||
const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
|
||||
unsigned Size = RC->getSize();
|
||||
Result.Named.LGKM = Size > 4 ? 2 : 1;
|
||||
} else {
|
||||
// s_dcache_inv etc. do not have a destination register. Assume we
|
||||
|
@ -185,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
|
|||
}
|
||||
|
||||
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
|
||||
|
||||
// Constants are always irrelevant
|
||||
if (!Op.isReg())
|
||||
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
|
||||
return false;
|
||||
|
||||
// Defines are always relevant
|
||||
|
@ -236,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
|
|||
return false;
|
||||
}
|
||||
|
||||
RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
|
||||
|
||||
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
|
||||
return std::make_pair(0, 0);
|
||||
|
||||
unsigned Reg = Op.getReg();
|
||||
unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
|
||||
|
||||
RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
|
||||
const MachineOperand &Reg) const {
|
||||
unsigned Size = RC->getSize();
|
||||
assert(Size >= 4);
|
||||
|
||||
RegInterval Result;
|
||||
Result.first = TRI->getEncodingValue(Reg);
|
||||
Result.first = TRI->getEncodingValue(Reg.getReg());
|
||||
Result.second = Result.first + Size / 4;
|
||||
|
||||
return Result;
|
||||
|
@ -305,12 +298,12 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
|
|||
}
|
||||
|
||||
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
|
||||
|
||||
MachineOperand &Op = I->getOperand(i);
|
||||
if (!isOpRelevant(Op))
|
||||
continue;
|
||||
|
||||
RegInterval Interval = getRegInterval(Op);
|
||||
const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
|
||||
RegInterval Interval = getRegInterval(RC, Op);
|
||||
for (unsigned j = Interval.first; j < Interval.second; ++j) {
|
||||
|
||||
// Remember which registers we define
|
||||
|
@ -405,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
|
|||
if (MI.getOpcode() == AMDGPU::S_SENDMSG)
|
||||
return LastIssued;
|
||||
|
||||
// For each register affected by this
|
||||
// instruction increase the result sequence
|
||||
// For each register affected by this instruction increase the result
|
||||
// sequence.
|
||||
//
|
||||
// TODO: We could probably just look at explicit operands if we removed VCC /
|
||||
// EXEC from SMRD dest reg classes.
|
||||
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
|
||||
|
||||
MachineOperand &Op = MI.getOperand(i);
|
||||
RegInterval Interval = getRegInterval(Op);
|
||||
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
|
||||
continue;
|
||||
|
||||
const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
|
||||
RegInterval Interval = getRegInterval(RC, Op);
|
||||
for (unsigned j = Interval.first; j < Interval.second; ++j) {
|
||||
|
||||
if (Op.isDef()) {
|
||||
|
|
|
@ -326,6 +326,8 @@ unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
|
|||
return getEncodingValue(Reg) & 0xff;
|
||||
}
|
||||
|
||||
// FIXME: This is very slow. It might be worth creating a map from physreg to
|
||||
// register class.
|
||||
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
|
||||
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
|
||||
|
||||
|
|
Loading…
Reference in New Issue