Revert "[AMDGPU] SIFixSGPRCopies refactoring"

Breaks ASan tests.

This reverts commit 3f8ae7efa8.

parent 23ace05e0a
commit 8ea1cf3111
@@ -120,10 +120,6 @@ public:
class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;
  SmallVector<MachineInstr*, 4> SCCCopies;
  SmallVector<MachineInstr*, 4> RegSequences;
  SmallVector<MachineInstr*, 4> PHINodes;
  SmallVector<MachineInstr*, 4> S2VCopies;
  unsigned NextVGPRToSGPRCopyID;
  DenseMap<unsigned, V2SCopyInfo> V2SCopies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

@@ -138,11 +134,8 @@ public:
  SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}

  bool runOnMachineFunction(MachineFunction &MF) override;
  void fixSCCCopies(bool IsWave32);
  void prepareRegSequenceAndPHIs(MachineFunction &MF);
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  bool needToBeConvertedToVALU(V2SCopyInfo *I);
  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
  void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info);
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. Physical register

@@ -178,6 +171,19 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

static bool hasVectorOperands(const MachineInstr &MI,
                              const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;

    if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg())))
      return true;
  }
  return false;
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,

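For readers without the surrounding LLVM sources, here is a minimal standalone sketch of the operand-scan pattern hasVectorOperands implements above. The Mock* types are illustrative stand-ins for MachineOperand and the register-class queries, not LLVM API:

#include <vector>

// Illustrative stand-ins; the real pass uses MachineOperand,
// MachineRegisterInfo and SIRegisterInfo instead.
struct MockOperand {
  bool IsReg;         // does this operand name a register?
  bool IsVirtual;     // virtual (SSA) register rather than physical?
  bool IsVectorClass; // does its register class hold vector registers?
};

// Same shape as hasVectorOperands: any virtual register operand whose
// class contains vector registers makes the instruction "vector-tainted".
static bool hasVectorOperands(const std::vector<MockOperand> &Ops) {
  for (const MockOperand &MO : Ops) {
    if (!MO.IsReg || !MO.IsVirtual)
      continue; // immediates and physical registers are skipped
    if (MO.IsVectorClass)
      return true;
  }
  return false;
}

int main() {
  // One scalar and one vector operand: the scan reports true.
  std::vector<MockOperand> Ops = {{true, true, false}, {true, true, true}};
  return hasVectorOperands(Ops) ? 0 : 1;
}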
@@ -610,6 +616,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  // We have to lower VGPR to SGPR copies before the main loop
  // because the REG_SEQUENCE and PHI lowering in the main loop
  // converts the def-use chains to VALU and closes the opportunities
  // for keeping them scalar.
  // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
  // addresses their lowering and unifies the processing in one main loop.
  lowerVGPR2SGPRCopies(MF);

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock *MBB = &*BI;

@@ -625,66 +639,100 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM: {
        Register SrcReg = MI.getOperand(1).getReg();
        Register DstReg = MI.getOperand(0).getReg();
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (MI.isCopy() && (SrcReg == AMDGPU::SCC || DstReg == AMDGPU::SCC))
          SCCCopies.push_back(&MI);

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Since VGPR to SGPR copies affect VGPR to SGPR copy
          // score and, hence, the lowering decision, let's try to get rid of
          // them as early as possible.
          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
        if (MI.isCopy()) {
          Register SrcReg = MI.getOperand(1).getReg();
          if (SrcReg == AMDGPU::SCC) {
            Register SCCCopy = MRI->createVirtualRegister(
                TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
            I = BuildMI(*MI.getParent(),
                        std::next(MachineBasicBlock::iterator(MI)),
                        MI.getDebugLoc(),
                        TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                               : AMDGPU::S_CSELECT_B64),
                        SCCCopy)
                    .addImm(-1)
                    .addImm(0);
            I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                        TII->get(AMDGPU::COPY), DstReg)
                    .addReg(SCCCopy);
            MI.eraseFromParent();
            continue;
          } else if (DstReg == AMDGPU::SCC) {
            unsigned Opcode =
                ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
            Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
            Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
            I = BuildMI(*MI.getParent(),
                        std::next(MachineBasicBlock::iterator(MI)),
                        MI.getDebugLoc(), TII->get(Opcode))
                    .addReg(Tmp, getDefRegState(true))
                    .addReg(SrcReg)
                    .addReg(Exec);
            MI.eraseFromParent();
            continue;
          // Collect those not changed to try them after VGPR to SGPR copies
          // lowering as there will be more opportunities.
          S2VCopies.push_back(&MI);
        }
        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
          continue;
        if (lowerSpecialCase(MI))
          continue;

        analyzeVGPRToSGPRCopy(&MI);

        break;
      }
      case AMDGPU::INSERT_SUBREG:
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
          for (MachineOperand &MO : MI.operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              MachineBasicBlock *BlockToInsertCopy =
                  MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB()
                             : MBB;
              MachineBasicBlock::iterator PointToInsertCopy =
                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;
              MachineInstr *NewCopy =
                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
                          PointToInsertCopy->getDebugLoc(),
                          TII->get(AMDGPU::COPY), NewDst)
                      .addReg(MO.getReg());
              MO.setReg(NewDst);
              analyzeVGPRToSGPRCopy(NewCopy);
            }
          }
        }

        if (MI.isPHI())
          PHINodes.push_back(&MI);
        else if (MI.isRegSequence())
          RegSequences.push_back(&MI);
        if (!DstReg.isVirtual()) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(*MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
                .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        processPHINode(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
          if (NewBB && NewBB != MBB) {
            MBB = NewBB;
            E = MBB->end();
            BI = MachineFunction::iterator(MBB);
            BE = MF.end();
          }
          assert((!NewBB || NewBB == I->getParent()) &&
                 "moveToVALU did not return the right basic block");
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {

@@ -752,41 +800,11 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
    }
  }

  lowerVGPR2SGPRCopies(MF);

  // Postprocessing
  fixSCCCopies(ST.isWave32());

  for (auto MI : S2VCopies) {
    // Check if it is still valid
    if (MI->getParent() && MI->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
    }
  }


  for (auto MI : RegSequences) {
    // Check if it is still valid
    if (MI->getParent() && MI->isRegSequence())
      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
  }

  for (auto MI : PHINodes) {
    processPHINode(*MI);
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  SiblingPenalty.clear();
  V2SCopies.clear();
  SCCCopies.clear();
  RegSequences.clear();
  PHINodes.clear();
  S2VCopies.clear();

  return true;
}

@@ -843,29 +861,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  if (!DstReg.isVirtual()) {
    // If the destination register is a physical register there isn't
    // really much we can do to fix this.
    // Some special instructions use M0 as an input. Some even only use
    // the first lane. Insert a readfirstlane and hope for the best.
    if (DstReg == AMDGPU::M0 &&
        TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
      Register TmpReg =
          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
          .add(MI.getOperand(1));
      MI.getOperand(1).setReg(TmpReg);
    }

    return true;
  }

  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    TII->moveToVALU(MI, MDT);
    return true;

@@ -884,13 +880,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
  return false;
}

void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));

void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) {
  SmallVector<MachineInstr *, 8> AnalysisWorklist;
  // Needed because the SSA is not a tree but a graph and may have
  // forks and joins. We should not then go the same way twice.

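The comment above is the reason analyzeVGPRToSGPRCopy keeps a visited set while walking def-use chains: in a graph with forks and joins, a plain worklist would enqueue a joined node once per incoming path. A minimal sketch of that discipline, with a hypothetical integer-indexed use graph standing in for MachineInstr users:

#include <unordered_set>
#include <vector>

// Hypothetical use graph: each node indexes its users, mimicking SSA
// def-use chains, which form a graph (forks and joins), not a tree.
struct Node { std::vector<int> Users; };

// Worklist walk that visits each node exactly once; without the Visited
// set, a node reachable along two paths (a join) would be processed twice.
static int countReachable(const std::vector<Node> &G, int Root) {
  std::vector<int> Worklist{Root};
  std::unordered_set<int> Visited;
  int Count = 0;
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue; // already seen via another path
    ++Count;
    for (int U : G[N].Users)
      Worklist.push_back(U);
  }
  return Count;
}

int main() {
  // 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3: node 3 is a join, still counted once.
  std::vector<Node> G = {{{1, 2}}, {{3}}, {{3}}, {{}}};
  return countReachable(G, 0) == 4 ? 0 : 1;
}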
@@ -938,52 +930,143 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
      AnalysisWorklist.push_back(U);
    }
  }
  V2SCopies[Info.ID] = Info;
}

// The main function that computes the VGPR to SGPR copy score
// and determines the copy's further lowering: v_readfirstlane_b32 or moveToVALU
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
  if (Info->SChain.empty()) {
    Info->Score = 0;
    return true;
  }
  Info->Siblings = SiblingPenalty[*std::max_element(
      Info->SChain.begin(), Info->SChain.end(),
      [&](MachineInstr *A, MachineInstr *B) -> bool {
        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
      })];
  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
  // The loop below computes the number of other VGPR to SGPR V2SCopies
  // which contribute to the current copy's SALU chain. We assume that all the
  // V2SCopies with the same source virtual register will be squashed to one
  // by regalloc. Also we take care of the V2SCopies of the different subregs
  // of the same register.
  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
  for (auto J : Info->Siblings) {
    auto InfoIt = V2SCopies.find(J);
    if (InfoIt != V2SCopies.end()) {
      MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
      if (SiblingCopy->isImplicitDef())
        // the COPY has already been MoveToVALUed
        continue;

      SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
                                    SiblingCopy->getOperand(1).getSubReg()));
    }
  }
  Info->SiblingPenalty = SrcRegs.size();

  unsigned Penalty =
      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
  unsigned Profit = Info->SChain.size();
  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
  Info->NeedToBeConvertedToVALU = Info->Score < 3;
  return Info->NeedToBeConvertedToVALU;
}

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  // The main function that computes the VGPR to SGPR copy score
  // and determines the copy's further lowering: v_readfirstlane_b32 or moveToVALU
  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
    if (I->SChain.empty()) {
      I->Score = 0;
      return true;
    }
    I->Siblings = SiblingPenalty[*std::max_element(
        I->SChain.begin(), I->SChain.end(),
        [&](MachineInstr *A, MachineInstr *B) -> bool {
          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
        })];
    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
    // The loop below computes the number of other VGPR to SGPR V2SCopies
    // which contribute to the current copy's SALU chain. We assume that all
    // the V2SCopies with the same source virtual register will be squashed to
    // one by regalloc. Also we take care of the V2SCopies of the different
    // subregs of the same register.
    SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
    for (auto J : I->Siblings) {
      auto InfoIt = V2SCopies.find(J);
      if (InfoIt != V2SCopies.end()) {
        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
        if (SiblingCopy->isImplicitDef())
          // the COPY has already been MoveToVALUed
          continue;

        SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
                                      SiblingCopy->getOperand(1).getSubReg()));
      }
    }
    I->SiblingPenalty = SrcRegs.size();

    unsigned Penalty =
        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
    unsigned Profit = I->SChain.size();
    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
    I->NeedToBeConvertedToVALU = I->Score < 3;
    return I->NeedToBeConvertedToVALU;
  };

  auto needProcessing = [](MachineInstr &MI) -> bool {
    switch (MI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
      return true;
    default:
      return false;
    }
  };

  SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock *MBB = &*BI;
    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I) {
      MachineInstr *MI = &*I;
      if (!needProcessing(*MI))
        continue;

      if (MI->isRegSequence() || MI->isPHI()) {
        MachineBasicBlock::iterator J = I;
        if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) {
          for (MachineOperand &MO : MI->operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              MachineBasicBlock *BlockToInsertCopy = MBB;
              MachineBasicBlock::iterator PointToInsertCopy = I;
              if (MI->isPHI()) {
                BlockToInsertCopy =
                    MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB();
                PointToInsertCopy =
                    BlockToInsertCopy->getFirstInstrTerminator();
              }
              MachineBasicBlock::iterator NewI =
                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
                          PointToInsertCopy->getDebugLoc(),
                          TII->get(AMDGPU::COPY), NewDst)
                      .addReg(MO.getReg());
              MO.setReg(NewDst);
              if (!MI->isPHI()) {
                I = NewI;
                MI = &*I;
              } else {
                // We insert the copy into the basic block that may have been
                // already processed. Pass it to the analysis explicitly.
                V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
                               TRI->getRegSizeInBits(*DestRC));
                analyzeVGPRToSGPRCopy(In);
                V2SCopies[In.ID] = In;
                OutOfOrderProcessedCopies.insert(MI);
              }
            }
          }
        }

        if (J == I)
          continue;
      }

      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);

      if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
        continue;

      if (lowerSpecialCase(*MI))
        continue;

      if (OutOfOrderProcessedCopies.contains(MI))
        continue;

      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
                     TRI->getRegSizeInBits(*DstRC));

      analyzeVGPRToSGPRCopy(In);

      V2SCopies[In.ID] = In;
    }
  }

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : V2SCopies) {
    if (needToBeConvertedToVALU(&C.second))

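To make the scoring concrete: as both versions above read, Penalty = NumSVCopies + SiblingPenalty + NumReadfirstlanes, Profit = SChain.size(), Score = Profit - Penalty (clamped at zero), and the copy is sent to moveToVALU when Score < 3. A self-contained sketch of that arithmetic; the struct is a mock of V2SCopyInfo and the sample numbers are invented:

#include <cstdio>

// Mirror of the scoring arithmetic in needToBeConvertedToVALU; the field
// names follow V2SCopyInfo, but this struct is a standalone mock.
struct ScoreInputs {
  unsigned SChainSize;        // SALU instructions kept scalar if not converted
  unsigned NumSVCopies;       // SGPR-to-VGPR copies the chain feeds back into
  unsigned SiblingPenalty;    // distinct sources among sibling V2S copies
  unsigned NumReadfirstlanes; // v_readfirstlane_b32 cost of the copy itself
};

static bool needToBeConvertedToVALU(const ScoreInputs &I) {
  if (I.SChainSize == 0)
    return true; // nothing scalar to save: always convert
  unsigned Penalty = I.NumSVCopies + I.SiblingPenalty + I.NumReadfirstlanes;
  unsigned Profit = I.SChainSize;
  unsigned Score = Penalty > Profit ? 0 : Profit - Penalty;
  return Score < 3; // below the threshold, moveToVALU wins
}

int main() {
  // A 6-instruction scalar chain with cost 2 scores 4: keep it scalar.
  std::printf("convert? %d\n", needToBeConvertedToVALU({6, 1, 0, 1}));
  // A 3-instruction chain with cost 2 scores 1: convert to VALU.
  std::printf("convert? %d\n", needToBeConvertedToVALU({3, 1, 0, 1}));
}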
@@ -1059,46 +1142,3 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
      MI->eraseFromParent();
    }
  }
}

void SIFixSGPRCopies::fixSCCCopies(bool IsWave32) {
  for (auto MI : SCCCopies) {
    // May be lowered out
    if (!MI->getParent())
      continue;
    // May already have been lowered.
    if (!MI->isCopy())
      continue;
    Register SrcReg = MI->getOperand(1).getReg();
    Register DstReg = MI->getOperand(0).getReg();
    if (SrcReg == AMDGPU::SCC) {
      Register SCCCopy = MRI->createVirtualRegister(
          TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
      MachineBasicBlock::iterator I =
          BuildMI(*MI->getParent(),
                  std::next(MachineBasicBlock::iterator(MI)),
                  MI->getDebugLoc(),
                  TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
                                    : AMDGPU::S_CSELECT_B64),
                  SCCCopy)
              .addImm(-1)
              .addImm(0);
      BuildMI(*MI->getParent(), std::next(I), I->getDebugLoc(),
              TII->get(AMDGPU::COPY), DstReg)
          .addReg(SCCCopy);
      MI->eraseFromParent();
      continue;
    }

    if (DstReg == AMDGPU::SCC) {
      unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
      BuildMI(*MI->getParent(), std::next(MachineBasicBlock::iterator(MI)),
              MI->getDebugLoc(), TII->get(Opcode))
          .addReg(Tmp, getDefRegState(true))
          .addReg(SrcReg)
          .addReg(Exec);
      MI->eraseFromParent();
    }
  }
}

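Both the inline handling above and fixSCCCopies pick wave-size-specific opcodes: a copy from SCC is materialized with S_CSELECT_B32/S_CSELECT_B64, and a copy to SCC is rebuilt by ANDing the source with EXEC_LO/EXEC. A small sketch of that selection, with plain strings standing in for the AMDGPU opcode and register enums:

#include <cstdio>

// Stand-ins for the AMDGPU opcode/register enums used by fixSCCCopies.
struct SCCLowering {
  const char *CselectOpc; // for COPY vreg <- SCC: tmp = CSELECT -1, 0
  const char *AndOpc;     // for COPY SCC <- vreg: tmp = AND src, exec
  const char *ExecReg;    // AND operand that masks in only active lanes
};

static SCCLowering pickSCCLowering(bool IsWave32) {
  if (IsWave32)
    return {"S_CSELECT_B32", "S_AND_B32", "EXEC_LO"};
  return {"S_CSELECT_B64", "S_AND_B64", "EXEC"};
}

int main() {
  for (bool IsWave32 : {true, false}) {
    SCCLowering L = pickSCCLowering(IsWave32);
    std::printf("wave%d: %s / %s with %s\n", IsWave32 ? 32 : 64,
                L.CselectOpc, L.AndOpc, L.ExecReg);
  }
}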
@@ -41,9 +41,9 @@ body: |
    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
    ; W64-NEXT: {{ $}}
    ; W64-NEXT: .1:

@@ -88,9 +88,9 @@ body: |
    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
    ; W32-NEXT: {{ $}}
    ; W32-NEXT: .1:

@@ -160,10 +160,10 @@ body: |
    ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
    ; W64-NEXT: {{ $}}
    ; W64-NEXT: .1:

@@ -207,10 +207,10 @@ body: |
    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
    ; W32-NEXT: {{ $}}
    ; W32-NEXT: .1:

@@ -280,10 +280,10 @@ body: |
    ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
    ; W64-NEXT: {{ $}}
    ; W64-NEXT: .1:

@@ -327,10 +327,10 @@ body: |
    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
    ; W32-NEXT: {{ $}}
    ; W32-NEXT: .1:

@@ -400,9 +400,9 @@ body: |
    ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
    ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
    ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0

@@ -429,9 +429,9 @@ body: |
    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
    ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0

@@ -485,9 +485,9 @@ body: |
    ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
    ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
    ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0

@@ -513,9 +513,9 @@ body: |
    ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
    ; W64-NO-ADDR64-NEXT: {{ $}}
    ; W64-NO-ADDR64-NEXT: .1:

@@ -560,9 +560,9 @@ body: |
    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
    ; W32-NEXT: {{ $}}
    ; W32-NEXT: .1:

@@ -433,7 +433,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35

@@ -585,7 +585,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4

@@ -722,7 +722,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff

@@ -741,7 +741,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5

@@ -770,7 +770,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11