AMDGPU/SILoadStoreOptimizer: Optimize scanning for mergeable instructions
Summary:
This adds a pre-pass to this optimization that scans through the basic block
and generates lists of mergeable instructions, with one list per unique
address.

In the optimization phase, instead of scanning through the basic block for
mergeable instructions, we now iterate over the lists generated by the
pre-pass.

The decision to re-optimize a block is now made per list, so if we fail to
merge any instructions with the same address, we do not attempt to optimize
them in future passes over the block. This helps reduce the time this pass
spends re-optimizing instructions. In one pathological test case, this change
reduces the time spent in the SILoadStoreOptimizer from 0.2s to 0.03s.

This restructuring will also make it possible to implement further
improvements in this pass, because we can now add less expensive checks to the
pre-pass and filter instructions out early, which avoids the need to do the
expensive scanning during the optimization phase. For example, checking for
adjacent offsets is an inexpensive test we can move to the pre-pass.

Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65961

llvm-svn: 373630
commit e6f5171305 (parent 9972c992eb)
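For orientation, here is a minimal, self-contained sketch of the two-phase shape the summary describes: a pre-pass buckets candidate operations into one list per base address, and the optimizer then works list by list, dropping any list that stops producing merges so it is never rescanned. MemOp, collectMergeable, and tryMergeAdjacent are invented stand-ins for this illustration only; the real pass operates on MachineInstr/CombineInfo as the diff below shows.

// NOTE: simplified stand-in types and helpers for illustration, not LLVM APIs.
#include <iostream>
#include <iterator>
#include <list>
#include <vector>

struct MemOp {
  int BaseReg; // stand-in for the base-address operand
  int Offset;  // constant offset from that base
};

// Pre-pass: bucket candidate operations, one list per unique base address.
static std::list<std::list<MemOp>> collectMergeable(const std::vector<MemOp> &Block) {
  std::list<std::list<MemOp>> Buckets;
  for (const MemOp &Op : Block) {
    bool Placed = false;
    for (std::list<MemOp> &Bucket : Buckets) {
      if (Bucket.front().BaseReg == Op.BaseReg) {
        Bucket.push_back(Op);
        Placed = true;
        break;
      }
    }
    if (!Placed)
      Buckets.emplace_back(1, Op); // base address not seen before: start a new list
  }
  return Buckets;
}

// Merge one pair of "adjacent" offsets in a bucket, if any. Reports whether a
// merge happened and whether the bucket is still worth revisiting afterwards.
static bool tryMergeAdjacent(std::list<MemOp> &Bucket, bool &OptimizeAgain) {
  for (auto I = Bucket.begin(); I != Bucket.end(); ++I) {
    for (auto J = std::next(I); J != Bucket.end(); ++J) {
      if (J->Offset == I->Offset + 1) { // toy notion of "adjacent"
        Bucket.erase(J);                // the pair collapses into one op
        OptimizeAgain = Bucket.size() >= 2;
        return true;
      }
    }
  }
  OptimizeAgain = false;
  return false;
}

int main() {
  std::vector<MemOp> Block = {{1, 0}, {2, 8}, {1, 1}, {2, 9}, {1, 7}};
  std::list<std::list<MemOp>> Buckets = collectMergeable(Block); // pre-pass, once per block

  int Merges = 0;
  bool OptimizeBlockAgain;
  do { // mirrors the do/while over optimizeBlock in runOnMachineFunction
    OptimizeBlockAgain = false;
    for (std::list<MemOp> &Bucket : Buckets) {
      if (Bucket.size() < 2)
        continue;
      bool Again = false;
      if (!tryMergeAdjacent(Bucket, Again)) {
        Bucket.clear(); // no merge possible: never rescan this list
        continue;
      }
      ++Merges;
      if (!Again)
        Bucket.clear(); // merged, but nothing further to do here
      OptimizeBlockAgain |= Again;
    }
  } while (OptimizeBlockAgain);

  std::cout << "merged " << Merges << " pairs\n"; // prints: merged 2 pairs
  return 0;
}

The key cost property is that a list is cleared the moment it fails to yield a merge, so repeated passes over the same block only revisit lists that are still known to contain opportunities. The commit's diff follows.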
@@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
       return true;
     }

+    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+      for (unsigned i = 0; i < NumAddresses; ++i) {
+        const MachineOperand *AddrOp = AddrReg[i];
+        // Immediates are always OK.
+        if (AddrOp->isImm())
+          continue;
+
+        // Don't try to merge addresses that aren't either immediates or registers.
+        // TODO: Should be possible to merge FrameIndexes and maybe some other
+        // non-register operands.
+        if (!AddrOp->isReg())
+          return false;
+
+        // TODO: We should be able to merge physical reg addresses.
+        if (Register::isPhysicalRegister(AddrOp->getReg()))
+          return false;
+
+        // If an address has only one use then there will be no other
+        // instructions with the same address, so we can't merge this one.
+        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+          return false;
+      }
+      return true;
+    }
+
     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
                const GCNSubtarget &STM);
     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
@@ -220,6 +245,10 @@ private:
   bool promoteConstantOffsetToImm(MachineInstr &CI,
                                   MemInfoMap &Visited,
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+  void addInstToMergeableList(const CombineInfo &CI,
+                              std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool collectMergeableInsts(MachineBasicBlock &MBB,
+                             std::list<std::list<CombineInfo> > &MergeableInsts) const;

 public:
   static char ID;
@@ -228,7 +257,11 @@ public:
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }

-  bool optimizeBlock(MachineBasicBlock &MBB);
+  void removeCombinedInst(std::list<CombineInfo> &MergeList,
+                          const MachineInstr &MI);
+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+                                     bool &OptimizeListAgain);
+  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

   bool runOnMachineFunction(MachineFunction &MF) override;

@@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &I->getOperand(AddrIdx[i]);
   }
+
+  InstsToMove.clear();
 }

 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
@@ -646,15 +681,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;

-  for (unsigned i = 0; i < CI.NumAddresses; i++) {
-    // We only ever merge operations with the same base address register, so
-    // don't bother scanning forward if there are no other uses.
-    if (CI.AddrReg[i]->isReg() &&
-        (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) ||
-         MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg())))
-      return false;
-  }
-
   ++MBBI;

   DenseSet<unsigned> RegDefsToMove;
@@ -827,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {

   moveInstsAfter(Copy1, CI.InstsToMove);

-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();

   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Next;
+  return Read2;
 }

 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
@@ -911,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {

   moveInstsAfter(Write2, CI.InstsToMove);

-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();

   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Next;
+  return Write2;
 }

 MachineBasicBlock::iterator
@@ -938,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

-  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.DLC0)      // dlc
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+      BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+          .addImm(MergedOffset) // offset
+          .addImm(CI.GLC0)      // glc
+          .addImm(CI.DLC0)      // dlc
+          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -963,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {

   moveInstsAfter(Copy1, CI.InstsToMove);

-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }

 MachineBasicBlock::iterator
@@ -997,15 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .addImm(CI.DLC0)      // dlc
-      .addImm(0)            // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(MergedOffset) // offset
+          .addImm(CI.GLC0)      // glc
+          .addImm(CI.SLC0)      // slc
+          .addImm(0)            // tfe
+          .addImm(CI.DLC0)      // dlc
+          .addImm(0)            // swz
+          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1025,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {

   moveInstsAfter(Copy1, CI.InstsToMove);

-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }

 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -1191,22 +1215,22 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .addImm(CI.DLC0)      // dlc
-      .addImm(0)            // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+          .addImm(CI.GLC0)      // glc
+          .addImm(CI.SLC0)      // slc
+          .addImm(0)            // tfe
+          .addImm(CI.DLC0)      // dlc
+          .addImm(0)            // swz
+          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

   moveInstsAfter(MIB, CI.InstsToMove);

-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }

 MachineOperand
@@ -1519,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }

-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
-
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+        AddrList.front().InstClass == CI.InstClass) {
+      AddrList.emplace_back(CI);
+      return;
+    }
+  }
+
+  // Base address not found, so add a new list.
+  MergeableInsts.emplace_back(1, CI);
+}
+
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  bool Modified = false;
   // Contain the list
   MemInfoMap Visited;
   // Contains the list of instructions for which constant offsets are being
   // promoted to the IMM.
   SmallPtrSet<MachineInstr *, 4> AnchorList;

-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
+  // Sort potential mergeable instructions into lists. One list per base address.
+  for (MachineInstr &MI : MBB.instrs()) {
+    // We run this before checking if an address is mergeable, because it can produce
+    // better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;

+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+    if (InstClass == UNKNOWN)
+      continue;
+
     // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef()) {
-      ++I;
-      continue;
-    }
-
-    CombineInfo CI;
-    CI.setMI(I, *TII, *STM);
+    if (MI.hasOrderedMemoryRef())
+      continue;
+
+    CombineInfo CI;
+    CI.setMI(MI, *TII, *STM);
+
+    if (!CI.hasMergeableAddress(*MRI))
+      continue;
+
+    addInstToMergeableList(CI, MergeableInsts);
+  }
+  return Modified;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+                       std::list<std::list<CombineInfo> > &MergeableInsts) {
+  bool Modified = false;
+
+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+    if (MergeList.size() < 2)
+      continue;
+
+    bool OptimizeListAgain = false;
+    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+      // We weren't able to make any changes, so clear the list so we don't
+      // process the same instructions the next time we try to optimize this
+      // block.
+      MergeList.clear();
+      continue;
+    }
+
+    // We made changes, but also determined that there were no more optimization
+    // opportunities, so we don't need to reprocess the list
+    if (!OptimizeListAgain)
+      MergeList.clear();
+
+    OptimizeAgain |= OptimizeListAgain;
+    Modified = true;
+  }
+  return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+                                         const MachineInstr &MI) {
+
+  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+    if (&*CI->I == &MI) {
+      MergeList.erase(CI);
+      return;
+    }
+  }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+                                          std::list<CombineInfo> &MergeList,
+                                          bool &OptimizeListAgain) {
+  bool Modified = false;
+  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+    CombineInfo &CI = *I;

     switch (CI.InstClass) {
     default:
@@ -1552,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     case DS_READ:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeRead2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case DS_WRITE:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeWrite2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case S_BUFFER_LOAD_IMM:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeSBufferLoadImmPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
       }
-      continue;
+      break;
     case BUFFER_LOAD_OFFEN:
     case BUFFER_LOAD_OFFSET:
     case BUFFER_LOAD_OFFEN_exact:
     case BUFFER_LOAD_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferLoadPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     case BUFFER_STORE_OFFEN:
     case BUFFER_STORE_OFFSET:
     case BUFFER_STORE_OFFEN_exact:
     case BUFFER_STORE_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferStorePair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     }
-
-    ++I;
+    // Clear the InstsToMove after we have finished searching so we don't have
+    // stale values left over if we search for this CI again in another pass
+    // over the block.
+    CI.InstsToMove.clear();
   }

   return Modified;
@@ -1626,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {

   bool Modified = false;

+
   for (MachineBasicBlock &MBB : MF) {
+    std::list<std::list<CombineInfo> > MergeableInsts;
+    // First pass: Collect list of all instructions we know how to merge.
+    Modified |= collectMergeableInsts(MBB, MergeableInsts);
     do {
       OptimizeAgain = false;
-      Modified |= optimizeBlock(MBB);
+      Modified |= optimizeBlock(MergeableInsts);
     } while (OptimizeAgain);
   }
