[AMDGPU] gfx908 agpr spilling

Differential Revision: https://reviews.llvm.org/D64594

llvm-svn: 365833
Stanislav Mekhanoshin 2019-07-11 21:54:13 +00:00
parent 18b78bfe9e
commit 937ff6e701
9 changed files with 762 additions and 44 deletions


@ -913,7 +913,6 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
return true;
}
#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
Optional<int> FramePointerSaveIndex) {
@ -947,6 +946,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
const SIRegisterInfo *TRI = ST.getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
FuncInfo->removeDeadFrameIndices(MFI);
assert(allSGPRSpillsAreDead(MFI, None) &&
"SGPR spill should have been removed in SILowerSGPRSpills");


@ -976,6 +976,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
return AMDGPU::SI_SPILL_S512_SAVE;
case 128:
return AMDGPU::SI_SPILL_S1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
@ -997,6 +999,25 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
return AMDGPU::SI_SPILL_V512_SAVE;
case 128:
return AMDGPU::SI_SPILL_V1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_A32_SAVE;
case 8:
return AMDGPU::SI_SPILL_A64_SAVE;
case 16:
return AMDGPU::SI_SPILL_A128_SAVE;
case 64:
return AMDGPU::SI_SPILL_A512_SAVE;
case 128:
return AMDGPU::SI_SPILL_A1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
@ -1055,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
}
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@ -1084,6 +1110,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_S512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_S1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@ -1105,6 +1133,25 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_V512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_V1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_A32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_A64_RESTORE;
case 16:
return AMDGPU::SI_SPILL_A128_RESTORE;
case 64:
return AMDGPU::SI_SPILL_A512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_A1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@ -1156,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled


@ -513,6 +513,7 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
@ -524,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
@ -535,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
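As a quick sanity check on the Size arithmetic above (a C++ sketch of the TableGen formula; the helper name is made up and nothing here is part of the patch): !srl(vgpr_class.Size, 5) is the number of 32-bit subregisters, each subregister contributes 8 bytes here (16 for the AGPR pseudos added below), 2 * 4 bytes are added on top, and the result is clamped because the Size field is an unsigned char.

// Hypothetical mirror of the TableGen formula above; not part of the patch.
unsigned spillPseudoSize(unsigned RegClassBits, unsigned BytesPerSubReg) {
  unsigned NumSubRegs = RegClassBits >> 5;            // !srl(vgpr_class.Size, 5)
  unsigned MaxSize = NumSubRegs * BytesPerSubReg + 8; // !add(!shl(...), 8)
  return MaxSize <= 256 ? MaxSize : 252;              // !if(!le(MaxSize, 256), MaxSize, 252)
}
// VReg_512:  16 * 8  + 8 = 136, kept as is.
// VReg_1024: 32 * 8  + 8 = 264, clamped to 252.
// AReg_1024: 32 * 16 + 8 = 520, clamped to 252.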
@ -547,6 +552,44 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
Constraints = "@earlyclobber $tmp",
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs VGPR_32:$tmp),
(ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (16 * num_subregs) bytes maximum
int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata, VGPR_32:$tmp),
(ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
// (2 * 4) + (16 * num_subregs) bytes maximum
int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),


@ -37,6 +37,12 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>;
namespace {
static cl::opt<bool> EnableSpillVGPRToAGPR(
"amdgpu-spill-vgpr-to-agpr",
cl::desc("Enable spilling VGPRs to AGPRs"),
cl::ReallyHidden,
cl::init(true));
class SILowerSGPRSpills : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
@ -242,10 +248,22 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
return false;
}
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
bool AllSGPRSpilledToVGPRs = false;
const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
&& EnableSpillVGPRToAGPR;
bool MadeChange = false;
if (TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) {
const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) ||
SpillVGPRToAGPR) {
AllSGPRSpilledToVGPRs = true;
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
@ -257,6 +275,18 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
Next = std::next(I);
if (SpillToAGPR && TII->isVGPRSpill(MI)) {
// Try to eliminate stack used by VGPR spills before frame
// finalization.
unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr);
int FI = MI.getOperand(FIOp).getIndex();
unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg)))
TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
}
if (!TII->isSGPRSpill(MI))
continue;
@ -266,18 +296,24 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
}
} else
AllSGPRSpilledToVGPRs = false;
}
}
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
MBB.addLiveIn(Reg);
for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();
}
FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
MadeChange = true;
}


@ -319,7 +319,75 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
int FI,
bool isAGPRtoVGPR) {
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
auto &Spill = VGPRToAGPRSpills[FI];
// This has already been allocated.
if (!Spill.Lanes.empty())
return Spill.FullyAllocated;
unsigned Size = FrameInfo.getObjectSize(FI);
unsigned NumLanes = Size / 4;
Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
const TargetRegisterClass &RC =
isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
auto Regs = RC.getRegisters();
auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Spill.FullyAllocated = true;
// FIXME: Move allocation logic out of MachineFunctionInfo and initialize
// once.
BitVector OtherUsedRegs;
OtherUsedRegs.resize(TRI->getNumRegs());
const uint32_t *CSRMask =
TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
if (CSRMask)
OtherUsedRegs.setBitsInMask(CSRMask);
// TODO: Should include register tuples, but doesn't matter with current
// usage.
for (MCPhysReg Reg : SpillAGPR)
OtherUsedRegs.set(Reg);
for (MCPhysReg Reg : SpillVGPR)
OtherUsedRegs.set(Reg);
SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
for (unsigned I = 0; I < NumLanes; ++I) {
NextSpillReg = std::find_if(
NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
!OtherUsedRegs[Reg];
});
if (NextSpillReg == Regs.end()) { // Registers exhausted
Spill.FullyAllocated = false;
break;
}
OtherUsedRegs.set(*NextSpillReg);
SpillRegs.push_back(*NextSpillReg);
Spill.Lanes[I] = *NextSpillReg++;
}
return Spill.FullyAllocated;
}
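Put differently, the spill slot is carved into 4-byte lanes and one scratch AGPR or VGPR is reserved per lane; the frame index only disappears if every lane receives a register. A minimal standalone model of that decision (the helper is hypothetical; the real loop above additionally skips CSRs and registers already used for other spills):

// Hypothetical model of the lane accounting in allocateVGPRSpillToAGPR.
bool canEliminateSpillSlot(unsigned SlotSizeInBytes, unsigned FreeScratchRegs) {
  unsigned NumLanes = SlotSizeInBytes / 4;  // one 32-bit register per lane
  return FreeScratchRegs >= NumLanes;       // otherwise FullyAllocated stays false
}
// e.g. a 64-byte AReg_512 spill slot needs 16 free VGPRs to be fully eliminated.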
void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
// The FP spill hasn't been inserted yet, so keep it around.
for (auto &R : SGPRToVGPRSpills) {
if (R.first != FramePointerSaveIndex)
@ -332,6 +400,11 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI)
++i)
if (i != FramePointerSaveIndex)
MFI.setStackID(i, TargetStackID::Default);
for (auto &R : VGPRToAGPRSpills) {
if (R.second.FullyAllocated)
MFI.RemoveStackObject(R.first);
}
}
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {


@ -442,6 +442,11 @@ public:
SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
};
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
};
SparseBitVector<> WWMReservedRegs;
void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
@ -456,6 +461,14 @@ private:
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
// AGPRs used for VGPR spills.
SmallVector<MCPhysReg, 32> SpillAGPR;
// VGPRs used for AGPR spills.
SmallVector<MCPhysReg, 32> SpillVGPR;
public: // FIXME
/// If this is set, an SGPR is used for save/restore of the register used for
/// the frame pointer.
@ -477,6 +490,20 @@ public:
return SpillVGPRs;
}
ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
}
ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
return SpillVGPR;
}
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const {
auto I = VGPRToAGPRSpills.find(FrameIndex);
return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister
: I->second.Lanes[Lane];
}
AMDGPU::SIModeRegisterDefaults getMode() const {
return Mode;
}
@ -484,7 +511,8 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != 0; };
unsigned getTIDReg() const { return TIDReg; };


@ -256,6 +256,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
return Reserved;
}
@ -448,10 +455,19 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A1024_RESTORE:
return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
@ -467,6 +483,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A128_RESTORE:
return 4;
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@ -477,11 +495,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_A64_SAVE:
case AMDGPU::SI_SPILL_A64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
case AMDGPU::SI_SPILL_V32_SAVE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_A32_SAVE:
case AMDGPU::SI_SPILL_A32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@ -541,6 +563,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
int Index,
unsigned Lane,
unsigned ValueReg,
bool IsKill) {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
if (Reg == AMDGPU::NoRegister)
return MachineInstrBuilder();
bool IsStore = MI->mayStore();
MachineRegisterInfo &MRI = MF->getRegInfo();
auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned Dst = IsStore ? Reg : ValueReg;
unsigned Src = IsStore ? ValueReg : Reg;
unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
: AMDGPU::V_ACCVGPR_READ_B32;
return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
.addReg(Src, getKillRegState(IsKill));
}
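The XOR above encodes the copy direction compactly: the accvgpr copy must write an AGPR exactly when storing a value into an AGPR lane or reloading an AGPR-class value out of a VGPR lane, and must read an AGPR otherwise. A minimal standalone restatement (the helper and enum names are hypothetical; only the XOR mirrors the patch):

// Hypothetical restatement of the opcode selection in spillVGPRtoAGPR.
enum class AccCopy { ReadToVGPR   /* V_ACCVGPR_READ_B32  */,
                     WriteToAGPR  /* V_ACCVGPR_WRITE_B32 */ };

AccCopy selectAccCopy(bool IsStore, bool RegIsVGPR) {
  // Reg is the register reserved for this lane; ValueReg is the spilled value.
  // The destination is an AGPR for a store into an AGPR lane and for a reload
  // of an AGPR-class value from a VGPR lane.
  return (IsStore ^ RegIsVGPR) ? AccCopy::WriteToAGPR : AccCopy::ReadToVGPR;
}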
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
@ -559,6 +610,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.add(*Reg)
@ -611,6 +665,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
Register TmpReg =
hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
: Register();
assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
if (!isUInt<12>(Offset + Size - EltSize)) {
@ -659,21 +717,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
MachineMemOperand *NewMMO
= MF->getMachineMemOperand(PInfo, MMO->getFlags(),
EltSize, MinAlign(Align, EltSize * i));
auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
auto MIB = BuildMI(*MBB, MI, DL, Desc)
.addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
.addMemOperand(NewMMO);
if (!MIB.getInstr()) {
unsigned FinalReg = SubReg;
if (TmpReg != AMDGPU::NoRegister) {
if (IsStore)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
.addReg(SubReg, getKillRegState(IsKill));
SubReg = TmpReg;
}
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
MachineMemOperand *NewMMO
= MF->getMachineMemOperand(PInfo, MMO->getFlags(),
EltSize, MinAlign(Align, EltSize * i));
MIB = BuildMI(*MBB, MI, DL, Desc)
.addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
.addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
FinalReg)
.addReg(TmpReg, RegState::Kill);
}
if (NumSubRegs > 1)
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
@ -1038,6 +1113,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
int FI,
RegScavenger *RS) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
@ -1046,6 +1122,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
@ -1080,6 +1157,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
switch (MI->getOpcode()) {
// SGPR register spill
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
@ -1092,6 +1170,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// SGPR register restore
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
@ -1104,13 +1183,19 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// VGPR register spill
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE: {
case AMDGPU::SI_SPILL_V32_SAVE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A64_SAVE:
case AMDGPU::SI_SPILL_A32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@ -1134,7 +1219,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A32_RESTORE:
case AMDGPU::SI_SPILL_A64_RESTORE:
case AMDGPU::SI_SPILL_A128_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_A1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==


@ -0,0 +1,108 @@
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s
; GCN-LABEL: {{^}}max_24regs_32a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
%elt1 = extractelement <16 x float> %mai.2, i32 0
%elt2 = extractelement <16 x float> %mai.1, i32 15
%elt3 = extractelement <16 x float> %mai.1, i32 14
%elt4 = extractelement <16 x float> %mai.2, i32 1
store float %elt1, float addrspace(1)* %out
%gep1 = getelementptr float, float addrspace(1)* %out, i64 1
store float %elt2, float addrspace(1)* %gep1
%gep2 = getelementptr float, float addrspace(1)* %out, i64 2
store float %elt3, float addrspace(1)* %gep2
%gep3 = getelementptr float, float addrspace(1)* %out, i64 3
store float %elt4, float addrspace(1)* %gep3
ret void
}
; GCN-LABEL: {{^}}max_12regs_13a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
br label %use
use:
call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
store <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
br label %st
st:
%gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
%gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep1
store <4 x float> %mai.2, <4 x float> addrspace(1)* %gep2
ret void
}
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4)
call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9)
ret void
}
; GCN-LABEL: {{^}}max_32regs_mfma32:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
bb:
%v = call i32 asm sideeffect "", "=a"()
br label %use
use:
%mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 2>, i32 0, i32 0, i32 0)
call void asm sideeffect "", "a"(i32 %v)
%elt1 = extractelement <32 x i32> %mai.1, i32 0
store i32 %elt1, i32 addrspace(1)* %arg
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }


@ -0,0 +1,288 @@
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; GCN-LABEL: {{^}}max_10_vgprs:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
; GCN: NumVgprs: 10
; GFX900: ScratchSize: 12
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 2
; GCN: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
%p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
%p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
%p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
%p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
%p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
%p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
%p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
%p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
%v1 = load volatile i32, i32 addrspace(1)* %p1
%v2 = load volatile i32, i32 addrspace(1)* %p2
%v3 = load volatile i32, i32 addrspace(1)* %p3
%v4 = load volatile i32, i32 addrspace(1)* %p4
%v5 = load volatile i32, i32 addrspace(1)* %p5
%v6 = load volatile i32, i32 addrspace(1)* %p6
%v7 = load volatile i32, i32 addrspace(1)* %p7
%v8 = load volatile i32, i32 addrspace(1)* %p8
%v9 = load volatile i32, i32 addrspace(1)* %p9
%v10 = load volatile i32, i32 addrspace(1)* %p10
call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
store volatile i32 %v1, i32 addrspace(1)* undef
store volatile i32 %v2, i32 addrspace(1)* undef
store volatile i32 %v3, i32 addrspace(1)* undef
store volatile i32 %v4, i32 addrspace(1)* undef
store volatile i32 %v5, i32 addrspace(1)* undef
store volatile i32 %v6, i32 addrspace(1)* undef
store volatile i32 %v7, i32 addrspace(1)* undef
store volatile i32 %v8, i32 addrspace(1)* undef
store volatile i32 %v9, i32 addrspace(1)* undef
store volatile i32 %v10, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GCN: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9
; GFX908: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GCN: NumVgprs: 10
; GFX900: ScratchSize: 12
; GFX908: ScratchSize: 8
; GCN: VGPRBlocks: 2
; GCN: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
%p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
%p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
%p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
%p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
%p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
%p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
%p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
%p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
%p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
%v1 = load volatile i32, i32 addrspace(1)* %p1
%v2 = load volatile i32, i32 addrspace(1)* %p2
%v3 = load volatile i32, i32 addrspace(1)* %p3
%v4 = load volatile i32, i32 addrspace(1)* %p4
%v5 = load volatile i32, i32 addrspace(1)* %p5
%v6 = load volatile i32, i32 addrspace(1)* %p6
%v7 = load volatile i32, i32 addrspace(1)* %p7
%v8 = load volatile i32, i32 addrspace(1)* %p8
%v9 = load volatile i32, i32 addrspace(1)* %p9
%v10 = load volatile i32, i32 addrspace(1)* %p10
call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
store volatile i32 %v1, i32 addrspace(1)* undef
store volatile i32 %v2, i32 addrspace(1)* undef
store volatile i32 %v3, i32 addrspace(1)* undef
store volatile i32 %v4, i32 addrspace(1)* undef
store volatile i32 %v5, i32 addrspace(1)* undef
store volatile i32 %v6, i32 addrspace(1)* undef
store volatile i32 %v7, i32 addrspace(1)* undef
store volatile i32 %v8, i32 addrspace(1)* undef
store volatile i32 %v9, i32 addrspace(1)* undef
store volatile i32 %v10, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0, 1
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GFX900: buffer_store_dword v{{[0-9]}},
; GCN-DAG: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GCN-DAG: buffer_load_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
; GCN: NumVgprs: 10
; GFX900: ScratchSize: 44
; GFX908: ScratchSize: 20
; GCN: VGPRBlocks: 2
; GCN: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
call void asm sideeffect "", "a"(i32 1)
%p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8
%p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16
%p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24
%p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32
%v1 = load volatile i64, i64 addrspace(1)* %p1
%v2 = load volatile i64, i64 addrspace(1)* %p2
%v3 = load volatile i64, i64 addrspace(1)* %p3
%v4 = load volatile i64, i64 addrspace(1)* %p4
%v5 = load volatile i64, i64 addrspace(1)* %p5
call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5)
store volatile i64 %v1, i64 addrspace(1)* %p2
store volatile i64 %v2, i64 addrspace(1)* %p3
store volatile i64 %v3, i64 addrspace(1)* %p4
store volatile i64 %v4, i64 addrspace(1)* %p5
store volatile i64 %v5, i64 addrspace(1)* %p1
ret void
}
; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0,
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GCN-NOT: a10
; GCN: buffer_store_dword v{{[0-9]}},
; GFX908: NumVgprs: 10
; GFX900: ScratchSize: 100
; GFX908: ScratchSize: 68
; GFX908: VGPRBlocks: 2
; GFX908: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
%v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v
; GFX900: buffer_store_dword v
; GFX900: buffer_load_dword v
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32
; GCN: NumVgprs: 256
; GFX900: ScratchSize: 148
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GCN: NumVGPRsForWavesPerEU: 256
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
%p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
%p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
%p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
%p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
%p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
%p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
%p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
%v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
%v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
%v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
%v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
%v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
%v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
%v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
%v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
%v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v
; GFX900: buffer_store_dword v
; GFX900: buffer_load_dword v
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32
; GCN: NumVgprs: 256
; GFX900: ScratchSize: 580
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GCN: NumVGPRsForWavesPerEU: 256
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
%p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
%p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
%p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
%p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
%p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
%p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
%p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
%v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
%v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
%v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
%v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
%v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
%v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
%v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
%v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
%v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
br label %st
st:
store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }