[AMDGPU] Do not reserve any VGPR for SGPR spills

After the split register allocation changes in eebe841a47 it is no longer necessary to reserve a VGPR before RA. Reserving one can also create bugs when IPRA is enabled, since we cannot predict whether a called function will reserve a register: it may not reserve any if it does not have any SGPR spills. If that happens, those functions may overwrite reserved registers that are normally callee saved. Added a test to show this. Fixes: SWDEV-309900 Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D115551
2022-01-11 22:11:17 -08:00 · 2022-01-11 22:11:17 -08:00 · 8470bf2b08
parent bbced74199
commit 8470bf2b08
7 changed files with 223 additions and 229 deletions
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@ -1320,16 +1320,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

-  // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
-  const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
-
  // We have to anticipate introducing CSR VGPR spills or spill of caller
  // save VGPR reserved for SGPR spills as we now always create stack entry
-  // for it, if we don't have any stack objects already, since we require
-  // an FP if there is a call and stack.
+  // for it, if we don't have any stack objects already, since we require an FP
+  // if there is a call and stack. We will allocate a VGPR for SGPR spills if
+  // there are any SGPR spills, whether they are CSR spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
-      FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
+      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment(
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

-static cl::opt<bool> VGPRReserveforSGPRSpill(
-    "amdgpu-reserve-vgpr-for-sgpr-spill",
-    cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
-
 static cl::opt<bool> UseDivergentRegisterIndexing(
  "amdgpu-use-divergent-register-indexing",
  cl::Hidden,
@ -11990,13 +11986,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  }

  TargetLoweringBase::finalizeLowering(MF);
-
-  // Allocate a VGPR for future SGPR Spill if
-  // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
-  // FIXME: We won't need this hack if we split SGPR allocation from VGPR
-  if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
-      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
-    Info->reserveVGPRforSGPRSpills(MF);
 }

 void SITargetLowering::computeKnownBitsForFrameIndex(
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
  return false;
 }

-// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
-static bool lowerShiftReservedVGPR(MachineFunction &MF,
-                                   const GCNSubtarget &ST) {
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
-  // Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
-  if (!PreReservedVGPR)
-    return false;
-
-  // If there are no free lower VGPRs available, default to using the
-  // pre-reserved register instead.
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  Register LowestAvailableVGPR =
-      TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
-  if (!LowestAvailableVGPR)
-    LowestAvailableVGPR = PreReservedVGPR;
-
-  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  // Create a stack object for a possible spill in the function prologue.
-  // Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
-  Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
-
-  // Find saved info about the pre-reserved register.
-  const auto *ReservedVGPRInfoItr =
-      llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
-                    [PreReservedVGPR](const auto &SpillRegInfo) {
-                      return SpillRegInfo.VGPR == PreReservedVGPR;
-                    });
-
-  assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
-  auto Index =
-      std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
-
-  FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
-
-  for (MachineBasicBlock &MBB : MF) {
-    assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
-    MBB.addLiveIn(LowestAvailableVGPR);
-    MBB.sortUniqueLiveIns();
-  }
-
-  return true;
-}
-
 bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
  if (!MFI.hasStackObjects() && !HasCSRs) {
    SaveBlocks.clear();
    RestoreBlocks.clear();
-    if (FuncInfo->VGPRReservedForSGPRSpill) {
-      // Free the reserved VGPR for later possible use by frame lowering.
-      FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
-      MRI.freezeReservedRegs(MF);
-    }
    return false;
  }

@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
    // This operates under the assumption that only other SGPR spills are users
    // of the frame index.

-    lowerShiftReservedVGPR(MF, ST);
-
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);

@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
    FuncInfo->removeDeadFrameIndices(MFI);

    MadeChange = true;
-  } else if (FuncInfo->VGPRReservedForSGPRSpill) {
-    FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
  }

  SaveBlocks.clear();
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@ -274,7 +274,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned WaveSize = ST.getWavefrontSize();
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
@ -291,16 +290,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
    Register LaneVGPR;
    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

-    // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and
-    // when one of the two conditions is true:
-    // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
-    // reserved.
-    // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
-    // required.
-    if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
-      assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
-      LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
-    } else if (VGPRIndex == 0) {
+    if (VGPRIndex == 0) {
      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
      if (LaneVGPR == AMDGPU::NoRegister) {
        // We have no VGPRs left for spilling SGPRs. Reset because we will not
@ -308,6 +298,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
        SGPRToVGPRSpills.erase(FI);
        NumVGPRSpillLanes -= I;

+        // FIXME: We can run out of free registers with split allocation if
+        // IPRA is enabled and a called function already uses every VGPR.
 #if 0
        DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
                                                  "VGPRs for SGPR spilling",
@ -340,21 +332,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
  return true;
 }

-/// Reserve a VGPR for spilling of SGPRs
-bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-
-  Register LaneVGPR = TRI->findUnusedRegister(
-      MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
-  if (LaneVGPR == Register())
-    return false;
-  SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
-  FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
-  return true;
-}
-
 /// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
 /// Either AGPR is spilled to VGPR to vice versa.
 /// Returns true if a \p FI can be eliminated completely.
@ -616,24 +593,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
  return false;
 }

-// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
-bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
-                                                   MachineFunction &MF) {
-  for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
-    if (i->VGPR == ReservedVGPR) {
-      SpillVGPRs.erase(i);
-
-      for (MachineBasicBlock &MBB : MF) {
-        MBB.removeLiveIn(ReservedVGPR);
-        MBB.sortUniqueLiveIns();
-      }
-      this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
-      return true;
-    }
-  }
-  return false;
-}
-
 bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
  if (UsesAGPRs)
    return *UsesAGPRs;
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@ -502,7 +502,6 @@ public: // FIXME
  Register SGPRForBPSaveRestoreCopy;
  Optional<int> BasePointerSaveIndex;

-  Register VGPRReservedForSGPRSpill;
  bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);

 public:
@ -528,7 +527,6 @@ public:
  void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
    SpillVGPRs[Index].VGPR = NewVGPR;
    SpillVGPRs[Index].FI = newFI;
-    VGPRReservedForSGPRSpill = NewVGPR;
  }

  bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
@ -556,7 +554,6 @@ public:
  bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
                                 unsigned NumLane) const;
  bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
-  bool reserveVGPRforSGPRSpills(MachineFunction &MF);
  bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
  void removeDeadFrameIndices(MachineFrameInfo &MFI);

--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@ -520,58 +520,58 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:176
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:192
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:224
 ; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
 ; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
 ; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
@ -582,50 +582,50 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
 ; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
@ -676,10 +676,10 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
 ; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
 ; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:492
 ; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
 ; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
 ; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
--- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
@ -5,7 +5,7 @@ define void @child_function() #0 {
  ret void
 }

-; GCN-LABEL: {{^}}reserve_vgpr_with_no_lower_vgpr_available:
+; GCN-LABEL: {{^}}spill_sgpr_with_no_lower_vgpr_available:
 ; GCN:  buffer_store_dword v255, off, s[0:3], s32
 ; GCN:  v_writelane_b32 v255, s33, 2
 ; GCN:  v_writelane_b32 v255, s30, 0
@ -16,7 +16,7 @@ define void @child_function() #0 {
 ; GCN:  v_readlane_b32 s33, v255, 2
 ; GCN: ; NumVgprs: 256

-define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
+define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

@ -51,7 +51,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
  ret void
 }

-; GCN-LABEL: {{^}}reserve_lowest_available_vgpr:
+; GCN-LABEL: {{^}}spill_to_lowest_available_vgpr:
 ; GCN:  buffer_store_dword v254, off, s[0:3], s32
 ; GCN:  v_writelane_b32 v254, s33, 2
 ; GCN:  v_writelane_b32 v254, s30, 0
@ -61,7 +61,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
 ; GCN:  v_readlane_b32 s31, v254, 1
 ; GCN:  v_readlane_b32 s33, v254, 2

-define void @reserve_lowest_available_vgpr() #0 {
+define void @spill_to_lowest_available_vgpr() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

@ -96,14 +96,14 @@ define void @reserve_lowest_available_vgpr() #0 {
  ret void
 }

-; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills:
+; GCN-LABEL: {{^}}spill_sgpr_with_sgpr_uses:
 ; GCN-NOT:  buffer_store_dword v255, off, s[0:3], s32
 ; GCN: ; def s4
 ; GCN: v_writelane_b32 v254, s4, 2
 ; GCN: v_readlane_b32 s4, v254, 2
 ; GCN: ; use s4

-define void @reserve_vgpr_with_sgpr_spills() #0 {
+define void @spill_sgpr_with_sgpr_uses() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

@ -147,12 +147,12 @@ ret:
  ret void
 }

-; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call
+; GCN-LABEL: {{^}}spill_sgpr_with_tail_call
 ; GCN-NOT:  buffer_store_dword v255, off, s[0:3], s32
 ; GCN-NOT:  v_writelane
 ; GCN:  s_setpc_b64 s[4:5]

-define void @reserve_vgpr_with_tail_call() #0 {
+define void @spill_sgpr_with_tail_call() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

@ -187,17 +187,29 @@ define void @reserve_vgpr_with_tail_call() #0 {
  ret void
 }

-; GCN-LABEL: {{^}}reserve_vgpr_for_sgpr_spills_no_alloca:
-; GCN:  v_writelane_b32 v5, s34, 0
-; GCN:  v_writelane_b32 v5, s35, 1
-; GCN:  v_writelane_b32 v5, s36, 2
-; GCN:  v_writelane_b32 v5, s37, 3
-; GCN:  v_readlane_b32 s37, v5, 3
-; GCN:  v_readlane_b32 s36, v5, 2
-; GCN:  v_readlane_b32 s35, v5, 1
-; GCN:  v_readlane_b32 s34, v5, 0
+; Special case where all registers are explicitly clobbered in the function and
+; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory.

-define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr:
+; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0
+; GCN: buffer_store_dword [[A]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0
+; GCN: buffer_store_dword [[B]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0
+; GCN: buffer_store_dword [[C]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0
+; GCN: buffer_store_dword [[D]], off, s[0:3], s32
+; GCN: #ASMEND
+; GCN: buffer_load_dword [[E:v[0-9]+]]
+; GCN: v_readlane_b32 s37, [[E]], 0
+; GCN: buffer_load_dword [[F:v[0-9]+]]
+; GCN: v_readlane_b32 s36, [[F]], 0
+; GCN: buffer_load_dword [[G:v[0-9]+]]
+; GCN: v_readlane_b32 s35, [[G]], 0
+; GCN: buffer_load_dword [[H:v[0-9]+]]
+; GCN: v_readlane_b32 s34, [[H]], 0
+
+define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  call void asm sideeffect "",
  "~{v6},~{v7},~{v8},~{v9}
@ -234,4 +246,96 @@ define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out
  ret void
 }

+; If IPRA no-CSR optimization is enabled, we will not be able to allocate a
+; VGPR for SGPR spills in the parent function since this child function uses all
+; VGPRs.
+
+; Callee of spill_sgpr_no_free_vgpr_ipra. The inline asm below lists every VGPR
+; from v0 through v255 as clobbered.
+define internal void @child_function_ipra() #0 {
+  call void asm sideeffect "",
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+  ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+  ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+  ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
+  ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+  ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+  ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+  ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+  ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+  ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+  ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+  ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+  ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
+  ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
+  ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
+  ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
+  ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
+  ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
+  ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
+  ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
+  ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
+  ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
+  ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
+  ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
+  ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
+  ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
+  ret void
+}
+
+; The checks below expect s30 and s31 (presumably the return-address pair,
+; confirm against the calling convention) to be written into lanes 0 and 1 of
+; v0, with v0 itself stored to memory before the call and reloaded after it.
+; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra:
+; GCN: v_writelane_b32 v0, s30, 0
+; GCN: v_writelane_b32 v0, s31, 1
+; GCN: buffer_store_dword v0, off
+; GCN: swappc
+; GCN: buffer_load_dword v0, off
+; GCN: v_readlane_b32 s30, v0, 0
+; GCN: v_readlane_b32 s31, v0, 1
+define void @spill_sgpr_no_free_vgpr_ipra() #0 {
+  call void @child_function_ipra()
+  ret void
+}
+
+; Callee of spill_sgpr_no_free_vgpr_ipra_tail_call. The inline asm below lists
+; every VGPR from v0 through v255 as clobbered.
+define internal void @child_function_ipra_tail_call() #0 {
+  call void asm sideeffect "",
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+  ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+  ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+  ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
+  ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+  ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+  ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+  ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+  ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+  ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+  ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+  ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+  ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
+  ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
+  ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
+  ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
+  ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
+  ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
+  ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
+  ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
+  ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
+  ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
+  ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
+  ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
+  ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
+  ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
+  ret void
+}
+
+; Tail-call variant of the IPRA test. The checks below expect no lane writes or
+; reads, no buffer store, no reload of v0 and no swappc before the final setpc.
+; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra_tail_call:
+; GCN-NOT: v_writelane_b32
+; GCN-NOT: buffer_store_dword
+; GCN-NOT: swappc
+; GCN-NOT: buffer_load_dword v0, off
+; GCN-NOT: v_readlane_b32
+; GCN: setpc
+define void @spill_sgpr_no_free_vgpr_ipra_tail_call() #0 {
+  tail call void @child_function_ipra_tail_call()
+  ret void
+}
+
+
 attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }