AMDGPU/SILoadStoreOptimizer: Optimize scanning for mergeable instructions

Summary:
This adds a pre-pass to the optimization that scans through the basic
block and generates lists of mergeable instructions, with one list per
unique base address.

In the optimization phase, instead of scanning through the basic block for
mergeable instructions, we now iterate over the lists generated by the
pre-pass.
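
A minimal sketch of the resulting two-phase driver, condensed from the new
runOnMachineFunction in the diff below (all names are the patch's own):

    std::list<std::list<CombineInfo> > MergeableInsts;
    // Phase 1: a single scan of the block sorts candidates into
    // per-address lists.
    Modified |= collectMergeableInsts(MBB, MergeableInsts);
    do {
      // Phase 2: merge within each list, repeating only while profitable.
      OptimizeAgain = false;
      Modified |= optimizeBlock(MergeableInsts);
    } while (OptimizeAgain);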

The decision to re-optimize a block is now made per list, so if we fail to
merge any instructions with the same address, we do not attempt to optimize
them in future passes over the block. This helps reduce the time this pass
spends re-optimizing instructions.
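
In code, the per-list decision reduces to this shape (excerpted, lightly
trimmed, from the new optimizeBlock in the diff below):

    for (std::list<CombineInfo> &MergeList : MergeableInsts) {
      if (MergeList.size() < 2)
        continue;
      bool OptimizeListAgain = false;
      if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
        // Nothing merged for this address, so drop the list; later
        // iterations never rescan these instructions.
        MergeList.clear();
        continue;
      }
      // Merges happened; keep the list only if more may be possible.
      if (!OptimizeListAgain)
        MergeList.clear();
      OptimizeAgain |= OptimizeListAgain;
    }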

In one pathological test case, this change reduces the time spent in the
SILoadStoreOptimizer from 0.2s to 0.03s.

This restructuring will also make it possible to implement further
improvements in this pass, because we can now add less expensive checks to
the pre-pass and filter instructions out early, avoiding the need for the
expensive scanning during the optimization phase. For example, checking for
adjacent offsets is an inexpensive test we could move to the pre-pass, as
sketched below.
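
As a hedged illustration only (not part of this patch), such a check could
operate on plain offsets; hasAdjacentOffsets and its std::vector interface
are hypothetical stand-ins for the pass's CombineInfo bookkeeping:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Keep a candidate list only if at least two constant offsets are
    // exactly one element apart, since only adjacent accesses can merge.
    static bool hasAdjacentOffsets(std::vector<int64_t> Offsets,
                                   int64_t EltSize) {
      std::sort(Offsets.begin(), Offsets.end());
      for (size_t I = 1; I < Offsets.size(); ++I)
        if (Offsets[I] - Offsets[I - 1] == EltSize)
          return true;
      return false;
    }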

Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65961

llvm-svn: 373630
Tom Stellard 2019-10-03 17:11:47 +00:00
parent 9972c992eb
commit e6f5171305
1 changed file with 185 additions and 82 deletions

@@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
       return true;
     }
 
+    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+      for (unsigned i = 0; i < NumAddresses; ++i) {
+        const MachineOperand *AddrOp = AddrReg[i];
+
+        // Immediates are always OK.
+        if (AddrOp->isImm())
+          continue;
+
+        // Don't try to merge addresses that aren't either immediates or registers.
+        // TODO: Should be possible to merge FrameIndexes and maybe some other
+        // non-register operands.
+        if (!AddrOp->isReg())
+          return false;
+
+        // TODO: We should be able to merge physical reg addresses.
+        if (Register::isPhysicalRegister(AddrOp->getReg()))
+          return false;
+
+        // If an address has only one use then there will be no other
+        // instructions with the same address, so we can't merge this one.
+        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+          return false;
+      }
+      return true;
+    }
+
     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
                const GCNSubtarget &STM);
     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
@@ -220,6 +245,10 @@ private:
   bool promoteConstantOffsetToImm(MachineInstr &CI,
                                   MemInfoMap &Visited,
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+  void addInstToMergeableList(const CombineInfo &CI,
+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool collectMergeableInsts(MachineBasicBlock &MBB,
+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
 
 public:
   static char ID;
@@ -228,7 +257,11 @@ public:
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  void removeCombinedInst(std::list<CombineInfo> &MergeList,
+                          const MachineInstr &MI);
+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+                                     bool &OptimizeListAgain);
+  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &I->getOperand(AddrIdx[i]);
   }
+
+  InstsToMove.clear();
 }
 
 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
@@ -646,15 +681,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;
 
-  for (unsigned i = 0; i < CI.NumAddresses; i++) {
-    // We only ever merge operations with the same base address register, so
-    // don't bother scanning forward if there are no other uses.
-    if (CI.AddrReg[i]->isReg() &&
-        (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) ||
-         MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg())))
-      return false;
-  }
-
   ++MBBI;
 
   DenseSet<unsigned> RegDefsToMove;
@@ -827,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Next;
+  return Read2;
 }
 
 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
@@ -911,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
 
   moveInstsAfter(Write2, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Next;
+  return Write2;
 }
 
 MachineBasicBlock::iterator
@@ -938,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.DLC0)      // dlc
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+        .addImm(MergedOffset) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.DLC0)      // dlc
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -963,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 MachineBasicBlock::iterator
@@ -997,15 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .addImm(CI.DLC0)      // dlc
-      .addImm(0)            // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(MergedOffset) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.SLC0)      // slc
+        .addImm(0)            // tfe
+        .addImm(CI.DLC0)      // dlc
+        .addImm(0)            // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1025,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -1191,22 +1215,22 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
-      .addImm(CI.GLC0)                          // glc
-      .addImm(CI.SLC0)                          // slc
-      .addImm(0)                                // tfe
-      .addImm(CI.DLC0)                          // dlc
-      .addImm(0)                                // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+        .addImm(CI.GLC0)                          // glc
+        .addImm(CI.SLC0)                          // slc
+        .addImm(0)                                // tfe
+        .addImm(CI.DLC0)                          // dlc
+        .addImm(0)                                // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   moveInstsAfter(MIB, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 MachineOperand
@@ -1519,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }
 
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+        AddrList.front().InstClass == CI.InstClass) {
+      AddrList.emplace_back(CI);
+      return;
+    }
+  }
+
+  // Base address not found, so add a new list.
+  MergeableInsts.emplace_back(1, CI);
+}
+
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  bool Modified = false;
   // Contain the list
   MemInfoMap Visited;
   // Contains the list of instructions for which constant offsets are being
   // promoted to the IMM.
   SmallPtrSet<MachineInstr *, 4> AnchorList;
 
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
+  // Sort potential mergeable instructions into lists. One list per base address.
+  for (MachineInstr &MI : MBB.instrs()) {
+    // We run this before checking if an address is mergeable, because it can produce
+    // better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+    if (InstClass == UNKNOWN)
+      continue;
+
     // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef()) {
-      ++I;
+    if (MI.hasOrderedMemoryRef())
       continue;
-    }
+
+    CombineInfo CI;
+    CI.setMI(MI, *TII, *STM);
+
+    if (!CI.hasMergeableAddress(*MRI))
+      continue;
+
+    addInstToMergeableList(CI, MergeableInsts);
+  }
+  return Modified;
+}
 
-    CombineInfo CI;
-    CI.setMI(I, *TII, *STM);
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+                       std::list<std::list<CombineInfo> > &MergeableInsts) {
+  bool Modified = false;
+
+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+    if (MergeList.size() < 2)
+      continue;
+
+    bool OptimizeListAgain = false;
+    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+      // We weren't able to make any changes, so clear the list so we don't
+      // process the same instructions the next time we try to optimize this
+      // block.
+      MergeList.clear();
+      continue;
+    }
+
+    // We made changes, but also determined that there were no more optimization
+    // opportunities, so we don't need to reprocess the list.
+    if (!OptimizeListAgain)
+      MergeList.clear();
+
+    OptimizeAgain |= OptimizeListAgain;
+    Modified = true;
+  }
+  return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+                                         const MachineInstr &MI) {
+  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+    if (&*CI->I == &MI) {
+      MergeList.erase(CI);
+      return;
+    }
+  }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+                                          std::list<CombineInfo> &MergeList,
+                                          bool &OptimizeListAgain) {
+  bool Modified = false;
+  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+    CombineInfo &CI = *I;
 
     switch (CI.InstClass) {
     default:
@@ -1552,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     case DS_READ:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeRead2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case DS_WRITE:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeWrite2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case S_BUFFER_LOAD_IMM:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeSBufferLoadImmPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
       }
-      continue;
+      break;
     case BUFFER_LOAD_OFFEN:
     case BUFFER_LOAD_OFFSET:
     case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferLoadPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     case BUFFER_STORE_OFFEN:
     case BUFFER_STORE_OFFSET:
     case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferStorePair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     }
-
-    ++I;
+    // Clear the InstsToMove after we have finished searching so we don't have
+    // stale values left over if we search for this CI again in another pass
+    // over the block.
+    CI.InstsToMove.clear();
   }
 
   return Modified;
@@ -1626,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   bool Modified = false;
 
   for (MachineBasicBlock &MBB : MF) {
+    std::list<std::list<CombineInfo> > MergeableInsts;
+    // First pass: Collect list of all instructions we know how to merge.
+    Modified |= collectMergeableInsts(MBB, MergeableInsts);
+
     do {
       OptimizeAgain = false;
-      Modified |= optimizeBlock(MBB);
+      Modified |= optimizeBlock(MergeableInsts);
     } while (OptimizeAgain);
   }