forked from OSchip/llvm-project
[amdgpu] Handle the case where there is no scavenged register.
- When an unconditional branch is expanded into an indirect branch, if there is no scavenged register, an SGPR pair needs spilling to enable the destination PC calculation. In addition, before jumping into the destination, that clobbered SGPR pair needs restoring.
- As an SGPR cannot be spilled to or restored from memory directly, the spilling/restoring of that SGPR pair reuses the regular SGPR spilling support, but without spilling it into memory. As the spilling and restoring points are fully controlled, we only need to spill that SGPR into the temporary VGPR, which needs spilling into its emergency slot.
- The target-specific hook is revised to take an additional restore block, where the restoring code is filled in. After that, the relaxation will place that restore block directly before the destination block and insert, into any fall-through block, an unconditional branch to the destination block.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D106449
This commit is contained in:
parent
32d45862fc
commit
e6a4ba3aa6
|
@ -582,15 +582,14 @@ public:
|
|||
}
|
||||
|
||||
/// Insert an unconditional indirect branch at the end of \p MBB to \p
|
||||
/// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
|
||||
/// NewDestBB. Optionally, insert the clobbered register restoring in \p
|
||||
/// RestoreBB. \p BrOffset indicates the offset of \p NewDestBB relative to
|
||||
/// the offset of the position to insert the new branch.
|
||||
///
|
||||
/// \returns The number of bytes added to the block.
|
||||
virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
const DebugLoc &DL,
|
||||
int64_t BrOffset = 0,
|
||||
RegScavenger *RS = nullptr) const {
|
||||
virtual void insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
MachineBasicBlock &RestoreBB,
|
||||
const DebugLoc &DL, int64_t BrOffset = 0,
|
||||
RegScavenger *RS = nullptr) const {
|
||||
llvm_unreachable("target did not implement");
|
||||
}
|
||||
|
||||
|
|
|
@ -463,10 +463,48 @@ bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
|
|||
|
||||
DebugLoc DL = MI.getDebugLoc();
|
||||
MI.eraseFromParent();
|
||||
BlockInfo[BranchBB->getNumber()].Size += TII->insertIndirectBranch(
|
||||
*BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get());
|
||||
|
||||
// Create the optional restore block and, initially, place it at the end of
|
||||
// function. That block will be placed later if it's used; otherwise, it will
|
||||
// be erased.
|
||||
MachineBasicBlock *RestoreBB = createNewBlockAfter(MF->back());
|
||||
|
||||
TII->insertIndirectBranch(*BranchBB, *DestBB, *RestoreBB, DL,
|
||||
DestOffset - SrcOffset, RS.get());
|
||||
|
||||
BlockInfo[BranchBB->getNumber()].Size = computeBlockSize(*BranchBB);
|
||||
adjustBlockOffsets(*MBB);
|
||||
|
||||
// If RestoreBB is required, try to place just before DestBB.
|
||||
if (!RestoreBB->empty()) {
|
||||
// TODO: For multiple far branches to the same destination, there are
|
||||
// chances that some restore blocks could be shared if they clobber the
|
||||
// same registers and share the same restore sequence. So far, those
|
||||
// restore blocks are just duplicated for each far branch.
|
||||
assert(!DestBB->isEntryBlock());
|
||||
MachineBasicBlock *PrevBB = &*std::prev(DestBB->getIterator());
|
||||
if (auto *FT = PrevBB->getFallThrough()) {
|
||||
assert(FT == DestBB);
|
||||
TII->insertUnconditionalBranch(*PrevBB, DestBB, DebugLoc());
|
||||
// Recalculate the block size.
|
||||
BlockInfo[PrevBB->getNumber()].Size = computeBlockSize(*PrevBB);
|
||||
}
|
||||
// Now, RestoreBB could be placed directly before DestBB.
|
||||
MF->splice(DestBB->getIterator(), RestoreBB->getIterator());
|
||||
// Update successors and predecessors.
|
||||
RestoreBB->addSuccessor(DestBB);
|
||||
BranchBB->replaceSuccessor(DestBB, RestoreBB);
|
||||
if (TRI->trackLivenessAfterRegAlloc(*MF))
|
||||
computeAndAddLiveIns(LiveRegs, *RestoreBB);
|
||||
// Compute the restore block size.
|
||||
BlockInfo[RestoreBB->getNumber()].Size = computeBlockSize(*RestoreBB);
|
||||
// Update the offset starting from the previous block.
|
||||
adjustBlockOffsets(*PrevBB);
|
||||
} else {
|
||||
// Remove restore block if it's not required.
|
||||
MF->erase(RestoreBB);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -2223,15 +2223,17 @@ MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
|
|||
return MI.getOperand(0).getMBB();
|
||||
}
|
||||
|
||||
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &DestBB,
|
||||
const DebugLoc &DL,
|
||||
int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &DestBB,
|
||||
MachineBasicBlock &RestoreBB,
|
||||
const DebugLoc &DL, int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
assert(RS && "RegScavenger required for long branching");
|
||||
assert(MBB.empty() &&
|
||||
"new block should be inserted for expanding unconditional branch");
|
||||
assert(MBB.pred_size() == 1);
|
||||
assert(RestoreBB.empty() &&
|
||||
"restore block should be inserted for restoring clobbered registers");
|
||||
|
||||
MachineFunction *MF = MBB.getParent();
|
||||
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||
|
@ -2268,14 +2270,6 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
|||
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
|
||||
.addReg(PCReg);
|
||||
|
||||
auto ComputeBlockSize = [](const TargetInstrInfo *TII,
|
||||
const MachineBasicBlock &MBB) {
|
||||
unsigned Size = 0;
|
||||
for (const MachineInstr &MI : MBB)
|
||||
Size += TII->getInstSizeInBytes(MI);
|
||||
return Size;
|
||||
};
|
||||
|
||||
// FIXME: If spilling is necessary, this will fail because this scavenger has
|
||||
// no emergency stack slots. It is non-trivial to spill in this situation,
|
||||
// because the restore code needs to be specially placed after the
|
||||
|
@ -2314,22 +2308,34 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
|||
|
||||
RS->enterBasicBlockEnd(MBB);
|
||||
Register Scav = RS->scavengeRegisterBackwards(
|
||||
AMDGPU::SReg_64RegClass,
|
||||
MachineBasicBlock::iterator(GetPC), false, 0);
|
||||
MRI.replaceRegWith(PCReg, Scav);
|
||||
MRI.clearVirtRegs();
|
||||
RS->setRegUsed(Scav);
|
||||
AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
|
||||
/* RestoreAfter */ false, 0, /* AllowSpill */ false);
|
||||
if (Scav) {
|
||||
RS->setRegUsed(Scav);
|
||||
MRI.replaceRegWith(PCReg, Scav);
|
||||
MRI.clearVirtRegs();
|
||||
} else {
|
||||
// As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
|
||||
// SGPR spill.
|
||||
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
|
||||
MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
|
||||
MRI.clearVirtRegs();
|
||||
}
|
||||
|
||||
MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
|
||||
// Now, the distance could be defined.
|
||||
auto *Offset = MCBinaryExpr::createSub(
|
||||
MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
|
||||
MCSymbolRefExpr::create(DestLabel, MCCtx),
|
||||
MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
|
||||
// Add offset assignments.
|
||||
auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
|
||||
OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
|
||||
auto *ShAmt = MCConstantExpr::create(32, MCCtx);
|
||||
OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
|
||||
return ComputeBlockSize(this, MBB);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
|
||||
|
|
|
@ -275,11 +275,10 @@ public:
|
|||
|
||||
MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
|
||||
|
||||
unsigned insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
const DebugLoc &DL,
|
||||
int64_t BrOffset,
|
||||
RegScavenger *RS = nullptr) const override;
|
||||
void insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
MachineBasicBlock &RestoreBB, const DebugLoc &DL,
|
||||
int64_t BrOffset, RegScavenger *RS) const override;
|
||||
|
||||
bool analyzeBranchImpl(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I,
|
||||
|
|
|
@ -97,7 +97,7 @@ struct SGPRSpillBuilder {
|
|||
unsigned EltSize = 4;
|
||||
|
||||
RegScavenger *RS;
|
||||
MachineBasicBlock &MBB;
|
||||
MachineBasicBlock *MBB;
|
||||
MachineFunction &MF;
|
||||
SIMachineFunctionInfo &MFI;
|
||||
const SIInstrInfo &TII;
|
||||
|
@ -110,9 +110,14 @@ struct SGPRSpillBuilder {
|
|||
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
|
||||
bool IsWave32, MachineBasicBlock::iterator MI, int Index,
|
||||
RegScavenger *RS)
|
||||
: SuperReg(MI->getOperand(0).getReg()), MI(MI),
|
||||
IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
|
||||
RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
|
||||
: SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
|
||||
MI->getOperand(0).isKill(), Index, RS) {}
|
||||
|
||||
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
|
||||
bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
|
||||
bool IsKill, int Index, RegScavenger *RS)
|
||||
: SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
|
||||
Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
|
||||
MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
|
||||
IsWave32(IsWave32) {
|
||||
const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
|
||||
|
@ -189,8 +194,9 @@ struct SGPRSpillBuilder {
|
|||
if (SavedExecReg) {
|
||||
RS->setRegUsed(SavedExecReg);
|
||||
// Set exec to needed lanes
|
||||
BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
|
||||
auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
|
||||
BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
|
||||
auto I =
|
||||
BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
|
||||
if (!TmpVGPRLive)
|
||||
I.addReg(TmpVGPR, RegState::ImplicitDefine);
|
||||
// Spill needed lanes
|
||||
|
@ -201,7 +207,7 @@ struct SGPRSpillBuilder {
|
|||
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
|
||||
/*IsKill*/ false);
|
||||
// Spill inactive lanes
|
||||
auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
if (!TmpVGPRLive)
|
||||
I.addReg(TmpVGPR, RegState::ImplicitDefine);
|
||||
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
|
||||
|
@ -224,7 +230,7 @@ struct SGPRSpillBuilder {
|
|||
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
|
||||
/*IsKill*/ false);
|
||||
// Restore exec
|
||||
auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
|
||||
auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
|
||||
.addReg(SavedExecReg, RegState::Kill);
|
||||
// Add an implicit use of the load so it is not dead.
|
||||
// FIXME This inserts an unnecessary waitcnt
|
||||
|
@ -235,7 +241,7 @@ struct SGPRSpillBuilder {
|
|||
// Restore inactive lanes
|
||||
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
|
||||
/*IsKill*/ false);
|
||||
auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
if (!TmpVGPRLive) {
|
||||
I.addReg(TmpVGPR, RegState::ImplicitKill);
|
||||
}
|
||||
|
@ -261,11 +267,17 @@ struct SGPRSpillBuilder {
|
|||
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
|
||||
/*IsKill*/ false);
|
||||
// Spill inactive lanes
|
||||
BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
|
||||
BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
|
||||
}
|
||||
}
|
||||
|
||||
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
|
||||
assert(MBB->getParent() == &MF);
|
||||
MI = NewMI;
|
||||
MBB = NewMBB;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
@ -1337,13 +1349,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
|
|||
if (IsLoad) {
|
||||
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
|
||||
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
|
||||
buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
|
||||
buildSpillLoadStore(*SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
|
||||
Offset * SB.EltSize, MMO, SB.RS);
|
||||
} else {
|
||||
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
|
||||
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
|
||||
buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
|
||||
Offset * SB.EltSize, MMO, SB.RS);
|
||||
buildSpillLoadStore(*SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill,
|
||||
FrameReg, Offset * SB.EltSize, MMO, SB.RS);
|
||||
// This only ever adds one VGPR spill
|
||||
SB.MFI.addToSpilledVGPRs(1);
|
||||
}
|
||||
|
@ -1381,8 +1393,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
|
|||
|
||||
// Mark the "old value of vgpr" input undef only if this is the first sgpr
|
||||
// spill to this specific vgpr in the first basic block.
|
||||
auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
|
||||
Spill.VGPR)
|
||||
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
|
||||
SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
|
||||
.addReg(SubReg, getKillRegState(UseKill))
|
||||
.addImm(Spill.Lane)
|
||||
.addReg(Spill.VGPR);
|
||||
|
@ -1428,7 +1440,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
|
|||
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
|
||||
|
||||
MachineInstrBuilder WriteLane =
|
||||
BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
|
||||
BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
|
||||
SB.TmpVGPR)
|
||||
.addReg(SubReg, SubKillState)
|
||||
.addImm(i % PVD.PerVGPR)
|
||||
|
@ -1490,10 +1502,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
|
|||
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
|
||||
|
||||
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
|
||||
auto MIB =
|
||||
BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
|
||||
.addReg(Spill.VGPR)
|
||||
.addImm(Spill.Lane);
|
||||
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
|
||||
SubReg)
|
||||
.addReg(Spill.VGPR)
|
||||
.addImm(Spill.Lane);
|
||||
if (SB.NumSubRegs > 1 && i == 0)
|
||||
MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
|
||||
if (LIS) {
|
||||
|
@ -1524,7 +1536,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
|
|||
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
|
||||
|
||||
bool LastSubReg = (i + 1 == e);
|
||||
auto MIB = BuildMI(SB.MBB, MI, SB.DL,
|
||||
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
|
||||
SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
|
||||
.addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
|
||||
.addImm(i);
|
||||
|
@ -1550,6 +1562,75 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
|
||||
MachineBasicBlock &RestoreMBB,
|
||||
Register SGPR, RegScavenger *RS) const {
|
||||
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
|
||||
RS);
|
||||
SB.prepare();
|
||||
// Generate the spill of SGPR to SB.TmpVGPR.
|
||||
unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
|
||||
auto PVD = SB.getPerVGPRData();
|
||||
for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
|
||||
unsigned TmpVGPRFlags = RegState::Undef;
|
||||
// Write sub registers into the VGPR
|
||||
for (unsigned i = Offset * PVD.PerVGPR,
|
||||
e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
|
||||
i < e; ++i) {
|
||||
Register SubReg =
|
||||
SB.NumSubRegs == 1
|
||||
? SB.SuperReg
|
||||
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
|
||||
|
||||
MachineInstrBuilder WriteLane =
|
||||
BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
|
||||
SB.TmpVGPR)
|
||||
.addReg(SubReg, SubKillState)
|
||||
.addImm(i % PVD.PerVGPR)
|
||||
.addReg(SB.TmpVGPR, TmpVGPRFlags);
|
||||
TmpVGPRFlags = 0;
|
||||
// There could be undef components of a spilled super register.
|
||||
// TODO: Can we detect this and skip the spill?
|
||||
if (SB.NumSubRegs > 1) {
|
||||
// The last implicit use of the SB.SuperReg carries the "Kill" flag.
|
||||
unsigned SuperKillState = 0;
|
||||
if (i + 1 == SB.NumSubRegs)
|
||||
SuperKillState |= getKillRegState(SB.IsKill);
|
||||
WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
|
||||
}
|
||||
}
|
||||
// Don't need to write VGPR out.
|
||||
}
|
||||
|
||||
// Restore clobbered registers in the specified restore block.
|
||||
MI = RestoreMBB.end();
|
||||
SB.setMI(&RestoreMBB, MI);
|
||||
// Generate the restore of SGPR from SB.TmpVGPR.
|
||||
for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
|
||||
// Don't need to load VGPR in.
|
||||
// Unpack lanes
|
||||
for (unsigned i = Offset * PVD.PerVGPR,
|
||||
e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
|
||||
i < e; ++i) {
|
||||
Register SubReg =
|
||||
SB.NumSubRegs == 1
|
||||
? SB.SuperReg
|
||||
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
|
||||
bool LastSubReg = (i + 1 == e);
|
||||
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
|
||||
SubReg)
|
||||
.addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
|
||||
.addImm(i);
|
||||
if (SB.NumSubRegs > 1 && i == 0)
|
||||
MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
|
||||
}
|
||||
}
|
||||
SB.restore();
|
||||
|
||||
SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
|
||||
/// a VGPR and the stack slot can be safely eliminated when all other users are
|
||||
/// handled.
|
||||
|
|
|
@ -130,6 +130,10 @@ public:
|
|||
LiveIntervals *LIS = nullptr,
|
||||
bool OnlyToVGPR = false) const;
|
||||
|
||||
bool spillEmergencySGPR(MachineBasicBlock::iterator MI,
|
||||
MachineBasicBlock &RestoreMBB, Register SGPR,
|
||||
RegScavenger *RS) const;
|
||||
|
||||
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
|
||||
unsigned FIOperandNum,
|
||||
RegScavenger *RS) const override;
|
||||
|
|
|
@ -560,19 +560,19 @@ bool AVRInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
|
|||
}
|
||||
}
|
||||
|
||||
unsigned AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
const DebugLoc &DL,
|
||||
int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
void AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
MachineBasicBlock &RestoreBB,
|
||||
const DebugLoc &DL, int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
// This method inserts a *direct* branch (JMP), despite its name.
|
||||
// LLVM calls this method to fixup unconditional branches; it never calls
|
||||
// insertBranch or some hypothetical "insertDirectBranch".
|
||||
// See lib/CodeGen/RegisterRelaxation.cpp for details.
|
||||
// We end up here when a jump is too long for a RJMP instruction.
|
||||
auto &MI = *BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB);
|
||||
BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB);
|
||||
|
||||
return getInstSizeInBytes(MI);
|
||||
return;
|
||||
}
|
||||
|
||||
} // end of namespace llvm
|
||||
|
|
|
@ -107,10 +107,10 @@ public:
|
|||
bool isBranchOffsetInRange(unsigned BranchOpc,
|
||||
int64_t BrOffset) const override;
|
||||
|
||||
unsigned insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
const DebugLoc &DL, int64_t BrOffset,
|
||||
RegScavenger *RS) const override;
|
||||
void insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
MachineBasicBlock &RestoreBB, const DebugLoc &DL,
|
||||
int64_t BrOffset, RegScavenger *RS) const override;
|
||||
|
||||
private:
|
||||
const AVRRegisterInfo RI;
|
||||
|
|
|
@ -684,11 +684,11 @@ unsigned RISCVInstrInfo::insertBranch(
|
|||
return 2;
|
||||
}
|
||||
|
||||
unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &DestBB,
|
||||
const DebugLoc &DL,
|
||||
int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &DestBB,
|
||||
MachineBasicBlock &RestoreBB,
|
||||
const DebugLoc &DL, int64_t BrOffset,
|
||||
RegScavenger *RS) const {
|
||||
assert(RS && "RegScavenger required for long branching");
|
||||
assert(MBB.empty() &&
|
||||
"new block should be inserted for expanding unconditional branch");
|
||||
|
@ -714,10 +714,11 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
|
|||
RS->enterBasicBlockEnd(MBB);
|
||||
unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
|
||||
MI.getIterator(), false, 0);
|
||||
// TODO: The case when there is no scavenged register needs special handling.
|
||||
assert(Scav != RISCV::NoRegister && "No register is scavenged!");
|
||||
MRI.replaceRegWith(ScratchReg, Scav);
|
||||
MRI.clearVirtRegs();
|
||||
RS->setRegUsed(Scav);
|
||||
return 8;
|
||||
}
|
||||
|
||||
bool RISCVInstrInfo::reverseBranchCondition(
|
||||
|
|
|
@ -85,10 +85,10 @@ public:
|
|||
const DebugLoc &dl,
|
||||
int *BytesAdded = nullptr) const override;
|
||||
|
||||
unsigned insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
const DebugLoc &DL, int64_t BrOffset,
|
||||
RegScavenger *RS = nullptr) const override;
|
||||
void insertIndirectBranch(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock &NewDestBB,
|
||||
MachineBasicBlock &RestoreBB, const DebugLoc &DL,
|
||||
int64_t BrOffset, RegScavenger *RS) const override;
|
||||
|
||||
unsigned removeBranch(MachineBasicBlock &MBB,
|
||||
int *BytesRemoved = nullptr) const override;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue