diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4e968b67869c..d73f2b4abae8 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -913,7 +913,6 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { return true; } - #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, Optional FramePointerSaveIndex) { @@ -947,6 +946,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->removeDeadFrameIndices(MFI); assert(allSGPRSpillsAreDead(MFI, None) && "SGPR spill should have been removed in SILowerSGPRSpills"); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 88d37992072a..f7c23b3d9fb8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -976,6 +976,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S256_SAVE; case 64: return AMDGPU::SI_SPILL_S512_SAVE; + case 128: + return AMDGPU::SI_SPILL_S1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -997,6 +999,25 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V256_SAVE; case 64: return AMDGPU::SI_SPILL_V512_SAVE; + case 128: + return AMDGPU::SI_SPILL_V1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_SAVE; + case 8: + return AMDGPU::SI_SPILL_A64_SAVE; + case 16: + return AMDGPU::SI_SPILL_A128_SAVE; + case 64: + return AMDGPU::SI_SPILL_A512_SAVE; + case 128: + return AMDGPU::SI_SPILL_A1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -1055,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.hasAGPRs(RC) ? 
getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // data - .addFrameIndex(FrameIndex) // addr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + + auto MIB = BuildMI(MBB, MI, DL, get(Opcode)); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { @@ -1084,6 +1110,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S256_RESTORE; case 64: return AMDGPU::SI_SPILL_S512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_S1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -1105,6 +1133,25 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V256_RESTORE; case 64: return AMDGPU::SI_SPILL_V512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_V1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_A64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_A128_RESTORE; + case 64: + return AMDGPU::SI_SPILL_A512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_A1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -1156,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + unsigned Opcode = RI.hasAGPRs(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); + auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4831ede3d542..05fdd3065aa0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -513,6 +513,7 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S160 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; +defm SI_SPILL_S1024 : SI_SPILL_SGPR ; multiclass SI_SPILL_VGPR { let UseNamedOperandTable = 1, VGPRSpill = 1, @@ -524,7 +525,9 @@ multiclass SI_SPILL_VGPR { let mayStore = 1; let mayLoad = 0; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); } def _RESTORE : VPseudoInstSI < @@ -535,7 +538,9 @@ multiclass SI_SPILL_VGPR { let mayLoad = 1; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); } } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] } @@ -547,6 +552,44 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR ; defm SI_SPILL_V160 : SI_SPILL_VGPR ; defm SI_SPILL_V256 : SI_SPILL_VGPR ; defm SI_SPILL_V512 : SI_SPILL_VGPR ; +defm SI_SPILL_V1024 : SI_SPILL_VGPR ; + +multiclass SI_SPILL_AGPR { + let UseNamedOperandTable = 1, VGPRSpill = 1, + Constraints = "@earlyclobber $tmp", + SchedRW = [WriteVMEM] in { + def _SAVE : VPseudoInstSI < + (outs VGPR_32:$tmp), + (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, + SReg_32:$soffset, i32imm:$offset)> { + let mayStore = 1; + let mayLoad = 0; + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + + def _RESTORE : VPseudoInstSI < + (outs vgpr_class:$vdata, VGPR_32:$tmp), + (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, + i32imm:$offset)> { + let mayStore = 0; + let mayLoad = 1; + + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. 
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] +} + +defm SI_SPILL_A32 : SI_SPILL_AGPR ; +defm SI_SPILL_A64 : SI_SPILL_AGPR ; +defm SI_SPILL_A128 : SI_SPILL_AGPR ; +defm SI_SPILL_A512 : SI_SPILL_AGPR ; +defm SI_SPILL_A1024 : SI_SPILL_AGPR ; def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 7838a59b6338..abfe89491e7c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -37,6 +37,12 @@ using MBBVector = SmallVector; namespace { +static cl::opt EnableSpillVGPRToAGPR( + "amdgpu-spill-vgpr-to-agpr", + cl::desc("Enable spilling VGPRs to AGPRs"), + cl::ReallyHidden, + cl::init(true)); + class SILowerSGPRSpills : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; @@ -242,10 +248,22 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { return false; } + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + bool AllSGPRSpilledToVGPRs = false; + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + bool MadeChange = false; - if (TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) { + const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); + + // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be + // handled as SpilledToReg in regular PrologEpilogInserter. + if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) || + SpillVGPRToAGPR) { + AllSGPRSpilledToVGPRs = true; + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs // are spilled to VGPRs, in which case we can eliminate the stack usage. // @@ -257,6 +275,18 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; Next = std::next(I); + if (SpillToAGPR && TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. 
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) + ->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) + TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); + } + if (!TII->isSGPRSpill(MI)) continue; @@ -266,18 +296,24 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - } - + } else + AllSGPRSpilledToVGPRs = false; } } for (MachineBasicBlock &MBB : MF) { for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) MBB.addLiveIn(SSpill.VGPR); + + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + MBB.sortUniqueLiveIns(); } - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); MadeChange = true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index a3f6caaacc83..46da974a2f45 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -319,7 +319,75 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } -void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { +/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. +/// Either AGPR is spilled to VGPR to vice versa. +/// Returns true if a \p FI can be eliminated completely. +bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, + int FI, + bool isAGPRtoVGPR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + + assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI)); + + auto &Spill = VGPRToAGPRSpills[FI]; + + // This has already been allocated. + if (!Spill.Lanes.empty()) + return Spill.FullyAllocated; + + unsigned Size = FrameInfo.getObjectSize(FI); + unsigned NumLanes = Size / 4; + Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister); + + const TargetRegisterClass &RC = + isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass; + auto Regs = RC.getRegisters(); + + auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Spill.FullyAllocated = true; + + // FIXME: Move allocation logic out of MachineFunctionInfo and initialize + // once. + BitVector OtherUsedRegs; + OtherUsedRegs.resize(TRI->getNumRegs()); + + const uint32_t *CSRMask = + TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv()); + if (CSRMask) + OtherUsedRegs.setBitsInMask(CSRMask); + + // TODO: Should include register tuples, but doesn't matter with current + // usage. 
+ for (MCPhysReg Reg : SpillAGPR) + OtherUsedRegs.set(Reg); + for (MCPhysReg Reg : SpillVGPR) + OtherUsedRegs.set(Reg); + + SmallVectorImpl::const_iterator NextSpillReg = Regs.begin(); + for (unsigned I = 0; I < NumLanes; ++I) { + NextSpillReg = std::find_if( + NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) { + return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && + !OtherUsedRegs[Reg]; + }); + + if (NextSpillReg == Regs.end()) { // Registers exhausted + Spill.FullyAllocated = false; + break; + } + + OtherUsedRegs.set(*NextSpillReg); + SpillRegs.push_back(*NextSpillReg); + Spill.Lanes[I] = *NextSpillReg++; + } + + return Spill.FullyAllocated; +} + +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { // The FP spill hasn't been inserted yet, so keep it around. for (auto &R : SGPRToVGPRSpills) { if (R.first != FramePointerSaveIndex) @@ -332,6 +400,11 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) ++i) if (i != FramePointerSaveIndex) MFI.setStackID(i, TargetStackID::Default); + + for (auto &R : VGPRToAGPRSpills) { + if (R.second.FullyAllocated) + MFI.RemoveStackObject(R.first); + } } MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index a8928dacf774..f19b20ceb5da 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -442,6 +442,11 @@ public: SGPRSpillVGPRCSR(unsigned V, Optional F) : VGPR(V), FI(F) {} }; + struct VGPRSpillToAGPR { + SmallVector Lanes; + bool FullyAllocated = false; + }; + SparseBitVector<> WWMReservedRegs; void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } @@ -456,6 +461,14 @@ private: unsigned NumVGPRSpillLanes = 0; SmallVector SpillVGPRs; + DenseMap VGPRToAGPRSpills; + + // AGPRs used for VGPR spills. + SmallVector SpillAGPR; + + // VGPRs used for AGPR spills. + SmallVector SpillVGPR; + public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. @@ -477,6 +490,20 @@ public: return SpillVGPRs; } + ArrayRef getAGPRSpillVGPRs() const { + return SpillAGPR; + } + + ArrayRef getVGPRSpillAGPRs() const { + return SpillVGPR; + } + + MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const { + auto I = VGPRToAGPRSpills.find(FrameIndex); + return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister + : I->second.Lanes[Lane]; + } + AMDGPU::SIModeRegisterDefaults getMode() const { return Mode; } @@ -484,7 +511,8 @@ public: bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); - void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); + bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); + void removeDeadFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; unsigned getTIDReg() const { return TIDReg; }; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9fde16edade2..7c2839ccb4c0 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -256,6 +256,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } + // FIXME: Stop using reserved registers for this. 
+ for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + + for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } @@ -448,10 +455,19 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( static unsigned getNumSubRegsForSpillOp(unsigned Op) { switch (Op) { + case AMDGPU::SI_SPILL_S1024_SAVE: + case AMDGPU::SI_SPILL_S1024_RESTORE: + case AMDGPU::SI_SPILL_V1024_SAVE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A1024_RESTORE: + return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -467,6 +483,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -477,11 +495,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_V64_SAVE: case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: case AMDGPU::SI_SPILL_V32_SAVE: case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_A32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -541,6 +563,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } +static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, + int Index, + unsigned Lane, + unsigned ValueReg, + bool IsKill) { + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MI->getParent()->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); + + if (Reg == AMDGPU::NoRegister) + return MachineInstrBuilder(); + + bool IsStore = MI->mayStore(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + auto *TRI = static_cast(MRI.getTargetRegisterInfo()); + + unsigned Dst = IsStore ? Reg : ValueReg; + unsigned Src = IsStore ? ValueReg : Reg; + unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 + : AMDGPU::V_ACCVGPR_READ_B32; + + return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + .addReg(Src, getKillRegState(IsKill)); +} + // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. 
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, @@ -559,6 +610,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + return true; + MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .add(*Reg) @@ -611,6 +665,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); + Register TmpReg = + hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() + : Register(); + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); if (!isUInt<12>(Offset + Size - EltSize)) { @@ -659,21 +717,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); - MachineMemOperand *NewMMO - = MF->getMachineMemOperand(PInfo, MMO->getFlags(), - EltSize, MinAlign(Align, EltSize * i)); + auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); - auto MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addMemOperand(NewMMO); + if (!MIB.getInstr()) { + unsigned FinalReg = SubReg; + if (TmpReg != AMDGPU::NoRegister) { + if (IsStore) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) + .addReg(SubReg, getKillRegState(IsKill)); + SubReg = TmpReg; + } + + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); + MachineMemOperand *NewMMO + = MF->getMachineMemOperand(PInfo, MMO->getFlags(), + EltSize, MinAlign(Align, EltSize * i)); + + MIB = BuildMI(*MBB, MI, DL, Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) + .addReg(SOffset, SOffsetRegState) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(NewMMO); + + if (!IsStore && TmpReg != AMDGPU::NoRegister) + MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), + FinalReg) + .addReg(TmpReg, RegState::Kill); + } if (NumSubRegs > 1) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); @@ -1038,6 +1113,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( int FI, RegScavenger *RS) const { switch (MI->getOpcode()) { + case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: @@ -1046,6 +1122,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: return spillSGPR(MI, FI, RS, true); + case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -1080,6 +1157,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, switch (MI->getOpcode()) { // SGPR register spill + case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: @@ -1092,6 +1170,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // SGPR register restore + case AMDGPU::SI_SPILL_S1024_RESTORE: case 
AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -1104,13 +1183,19 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // VGPR register spill + case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V256_SAVE: case AMDGPU::SI_SPILL_V160_SAVE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: { + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -1134,7 +1219,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { + case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_A1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll new file mode 100644 index 000000000000..b12a7bc72a81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -0,0 +1,108 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s + +; GCN-LABEL: {{^}}max_24regs_32a_used: +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; A2V-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: ScratchSize: 0 +define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + %elt1 = extractelement <16 x float> %mai.2, i32 0 + %elt2 = extractelement <16 x float> %mai.1, i32 15 + %elt3 = extractelement <16 x float> %mai.1, i32 14 + %elt4 = extractelement <16 x float> %mai.2, i32 1 + store float %elt1, float addrspace(1)* %out + %gep1 = getelementptr float, float addrspace(1)* %out, i64 1 + store float %elt2, float addrspace(1)* %gep1 + %gep2 = getelementptr float, float addrspace(1)* %out, i64 2 + store float %elt3, float addrspace(1)* %gep2 + %gep3 = getelementptr float, float addrspace(1)* %out, i64 3 + 
store float %elt4, float addrspace(1)* %gep3 + + ret void +} + +; GCN-LABEL: {{^}}max_12regs_13a_used: +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; A2V-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4 +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: ScratchSize: 0 +define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + br label %use + +use: + call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5) + store <4 x float> , <4 x float> addrspace(1)* %out + br label %st + +st: + %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16 + %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32 + store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep1 + store <4 x float> %mai.2, <4 x float> addrspace(1)* %gep2 + ret void +} + +; GCN-LABEL: {{^}}max_10_vgprs_used_9a: +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; A2V-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: ScratchSize: 0 +define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4) + call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9) + ret void +} + +; GCN-LABEL: {{^}}max_32regs_mfma32: +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; A2V-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: ScratchSize: 0 +define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 { +bb: + %v = call i32 asm sideeffect "", "=a"() + br label %use + +use: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> , i32 0, i32 0, i32 0) + call void asm sideeffect "", "a"(i32 %v) + %elt1 = extractelement <32 x i32> %mai.1, i32 0 + store i32 %elt1, i32 addrspace(1)* %arg + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) +declare <32 x i32> 
@llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) + +attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } +attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll new file mode 100644 index 000000000000..b101e41833b8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -0,0 +1,288 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s + +; GCN-LABEL: {{^}}max_10_vgprs: +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} +; GFX900: buffer_store_dword v{{[0-9]}}, +; GFX900: buffer_store_dword v{{[0-9]}}, +; GFX900: buffer_load_dword v{{[0-9]}}, +; GFX900: buffer_load_dword v{{[0-9]}}, +; GFX908-NOT: buffer_ +; GFX908-DAG v_accvgpr_read_b32 v{{[0-9]}}, a0 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 + +; GCN: NumVgprs: 10 +; GFX900: ScratchSize: 12 +; GFX908: ScratchSize: 0 +; GCN: VGPRBlocks: 2 +; GCN: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 + %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8 + %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12 + %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16 + %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20 + %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24 + %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28 + %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32 + %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36 + %v1 = load volatile i32, i32 addrspace(1)* %p1 + %v2 = load volatile i32, i32 addrspace(1)* %p2 + %v3 = load volatile i32, i32 addrspace(1)* %p3 + %v4 = load volatile i32, i32 addrspace(1)* %p4 + %v5 = load volatile i32, i32 addrspace(1)* %p5 + %v6 = load volatile i32, i32 addrspace(1)* %p6 + %v7 = load volatile i32, i32 addrspace(1)* %p7 + %v8 = load volatile i32, i32 addrspace(1)* %p8 + %v9 = load volatile i32, i32 addrspace(1)* %p9 + %v10 = load volatile i32, i32 addrspace(1)* %p10 + call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) + store volatile i32 %v1, i32 addrspace(1)* undef + store volatile i32 %v2, i32 addrspace(1)* undef + store volatile i32 %v3, i32 addrspace(1)* undef + store volatile i32 %v4, i32 addrspace(1)* undef + store volatile i32 %v5, i32 addrspace(1)* undef + store volatile i32 %v6, i32 addrspace(1)* undef + store volatile i32 %v7, i32 addrspace(1)* undef + store volatile i32 %v8, i32 addrspace(1)* undef + store volatile i32 %v9, i32 addrspace(1)* undef + store volatile i32 %v10, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}max_10_vgprs_used_9a: +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908: 
v_accvgpr_write_b32 a9, v{{[0-9]}} +; GCN: buffer_store_dword v{{[0-9]}}, +; GFX900: buffer_store_dword v{{[0-9]}}, +; GFX900: buffer_load_dword v{{[0-9]}}, +; GFX900: buffer_load_dword v{{[0-9]}}, +; GFX908-NOT: buffer_ +; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 +; GFX908: buffer_load_dword v{{[0-9]}}, +; GFX908-NOT: buffer_ + +; GCN: NumVgprs: 10 +; GFX900: ScratchSize: 12 +; GFX908: ScratchSize: 8 +; GCN: VGPRBlocks: 2 +; GCN: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 + %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8 + %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12 + %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16 + %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20 + %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24 + %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28 + %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32 + %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36 + %v1 = load volatile i32, i32 addrspace(1)* %p1 + %v2 = load volatile i32, i32 addrspace(1)* %p2 + %v3 = load volatile i32, i32 addrspace(1)* %p3 + %v4 = load volatile i32, i32 addrspace(1)* %p4 + %v5 = load volatile i32, i32 addrspace(1)* %p5 + %v6 = load volatile i32, i32 addrspace(1)* %p6 + %v7 = load volatile i32, i32 addrspace(1)* %p7 + %v8 = load volatile i32, i32 addrspace(1)* %p8 + %v9 = load volatile i32, i32 addrspace(1)* %p9 + %v10 = load volatile i32, i32 addrspace(1)* %p10 + call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) + store volatile i32 %v1, i32 addrspace(1)* undef + store volatile i32 %v2, i32 addrspace(1)* undef + store volatile i32 %v3, i32 addrspace(1)* undef + store volatile i32 %v4, i32 addrspace(1)* undef + store volatile i32 %v5, i32 addrspace(1)* undef + store volatile i32 %v6, i32 addrspace(1)* undef + store volatile i32 %v7, i32 addrspace(1)* undef + store volatile i32 %v8, i32 addrspace(1)* undef + store volatile i32 %v9, i32 addrspace(1)* undef + store volatile i32 %v10, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill: +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-DAG: v_accvgpr_write_b32 a0, 1 +; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GFX900: buffer_store_dword v{{[0-9]}}, +; GCN-DAG: buffer_store_dword v{{[0-9]}}, +; GFX900: buffer_load_dword v{{[0-9]}}, +; GCN-DAG: buffer_load_dword v{{[0-9]}}, +; GFX908-DAG v_accvgpr_read_b32 v{{[0-9]}}, a1 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 +; GFX908-DAG: 
v_accvgpr_read_b32 v{{[0-9]}}, a5 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 + +; GCN: NumVgprs: 10 +; GFX900: ScratchSize: 44 +; GFX908: ScratchSize: 20 +; GCN: VGPRBlocks: 2 +; GCN: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + call void asm sideeffect "", "a"(i32 1) + %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8 + %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16 + %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24 + %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32 + %v1 = load volatile i64, i64 addrspace(1)* %p1 + %v2 = load volatile i64, i64 addrspace(1)* %p2 + %v3 = load volatile i64, i64 addrspace(1)* %p3 + %v4 = load volatile i64, i64 addrspace(1)* %p4 + %v5 = load volatile i64, i64 addrspace(1)* %p5 + call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5) + store volatile i64 %v1, i64 addrspace(1)* %p2 + store volatile i64 %v2, i64 addrspace(1)* %p3 + store volatile i64 %v3, i64 addrspace(1)* %p4 + store volatile i64 %v4, i64 addrspace(1)* %p5 + store volatile i64 %v5, i64 addrspace(1)* %p1 + ret void +} + +; GCN-LABEL: {{^}}max_10_vgprs_spill_v32: +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-DAG: v_accvgpr_write_b32 a0, +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GCN-NOT: a10 +; GCN: buffer_store_dword v{{[0-9]}}, + +; GFX908: NumVgprs: 10 +; GFX900: ScratchSize: 100 +; GFX908: ScratchSize: 68 +; GFX908: VGPRBlocks: 2 +; GFX908: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid + %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep + store volatile <32 x float> %v, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32: +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_write_b32 a0, v +; GFX900: buffer_store_dword v +; GFX900: buffer_load_dword v +; GFX908-NOT: buffer_ +; GFX908-DAG v_accvgpr_read_b32 + +; GCN: NumVgprs: 256 +; GFX900: ScratchSize: 148 +; GFX908: ScratchSize: 0 +; GCN: VGPRBlocks: 63 +; GCN: NumVGPRsForWavesPerEU: 256 +define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid + %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid + %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid + %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid + %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid + %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid + %p8 = getelementptr inbounds <32 x float>, <32 x float> 
addrspace(1)* %p7, i32 %tid + %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid + %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1 + %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2 + %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3 + %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4 + %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5 + %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6 + %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7 + %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8 + %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9 + store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-NOT: SCRATCH_RSRC +; GFX908-DAG: v_accvgpr_write_b32 a0, v +; GFX900: buffer_store_dword v +; GFX900: buffer_load_dword v +; GFX908-NOT: buffer_ +; GFX908-DAG v_accvgpr_read_b32 + +; GCN: NumVgprs: 256 +; GFX900: ScratchSize: 580 +; GFX908: ScratchSize: 0 +; GCN: VGPRBlocks: 63 +; GCN: NumVGPRsForWavesPerEU: 256 +define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid + %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid + %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid + %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid + %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid + %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid + %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid + %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid + %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1 + %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2 + %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3 + %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4 + %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5 + %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6 + %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7 + %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8 + %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9 + br label %st + +st: + store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef + store volatile <32 x float> %v3, <32 x float> 
addrspace(1)* undef
+  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
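
As a side note on spillVGPRtoAGPR() in SIRegisterInfo.cpp above: the direction of the accumulator copy follows from two bits of state. A spill store whose slot is backed by an AGPR lane becomes v_accvgpr_write_b32 (VGPR data into the AGPR), the matching restore becomes v_accvgpr_read_b32, and when an AGPR is itself spilled into a reserved VGPR lane both directions flip, which is what the IsStore ^ TRI->isVGPR(MRI, Reg) expression encodes. Below is a minimal standalone C++ sketch of just that selection; the enum and function names are illustrative and not part of the patch.

#include <cassert>

// Illustrative stand-ins for the two accumulator-copy opcodes.
enum AccCopyOpcode { V_ACCVGPR_WRITE_B32, V_ACCVGPR_READ_B32 };

// Mirrors the selection in spillVGPRtoAGPR(): on a store the reserved lane
// register is the destination of the copy, on a restore it is the source.
// Writing into an AGPR needs v_accvgpr_write_b32, reading out of one needs
// v_accvgpr_read_b32; if the lane register is itself a VGPR (an AGPR being
// spilled to a VGPR) the direction is inverted, hence the XOR.
static AccCopyOpcode pickAccCopyOpcode(bool IsStore, bool LaneRegIsVGPR) {
  return (IsStore ^ LaneRegIsVGPR) ? V_ACCVGPR_WRITE_B32
                                   : V_ACCVGPR_READ_B32;
}

int main() {
  // VGPR spill slot backed by an AGPR lane.
  assert(pickAccCopyOpcode(/*IsStore=*/true,  /*LaneRegIsVGPR=*/false) ==
         V_ACCVGPR_WRITE_B32);
  assert(pickAccCopyOpcode(/*IsStore=*/false, /*LaneRegIsVGPR=*/false) ==
         V_ACCVGPR_READ_B32);
  // AGPR spill slot backed by a VGPR lane: the copy directions swap.
  assert(pickAccCopyOpcode(/*IsStore=*/true,  /*LaneRegIsVGPR=*/true) ==
         V_ACCVGPR_READ_B32);
  assert(pickAccCopyOpcode(/*IsStore=*/false, /*LaneRegIsVGPR=*/true) ==
         V_ACCVGPR_WRITE_B32);
  return 0;
}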
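
A second note, on SIMachineFunctionInfo::allocateVGPRSpillToAGPR(): each 4 bytes of the spill slot is assigned one register from the opposite file (AGPR_32 lanes for a VGPR spill, VGPR_32 lanes for an AGPR spill), drawn from registers that are allocatable, unused in the function, not callee-saved, and not already claimed by an earlier spill. If the supply runs out, FullyAllocated stays false and the stack object is kept as a fallback. The following is a simplified, hypothetical model of that loop with the usability checks abstracted into a predicate; it is a sketch, not the patch's code.

#include <algorithm>
#include <cstdint>
#include <vector>

struct SpillLanes {
  std::vector<uint16_t> Lanes;  // one 32-bit lane register per 4 bytes
  bool FullyAllocated = false;
};

// "Regs" plays the role of the AGPR_32/VGPR_32 register list and "isUsable"
// the allocatable / not-yet-used / non-CSR checks done against
// MachineRegisterInfo and the call-preserved mask in the real code.
template <typename Pred>
SpillLanes allocateLanes(unsigned ObjectSizeInBytes,
                         const std::vector<uint16_t> &Regs, Pred isUsable) {
  SpillLanes Spill;
  Spill.Lanes.assign(ObjectSizeInBytes / 4, /*NoRegister=*/0);
  Spill.FullyAllocated = true;

  auto Next = Regs.begin();
  for (uint16_t &Lane : Spill.Lanes) {
    // The scan resumes where the previous lane stopped, so no register is
    // handed out twice for the same frame index.
    Next = std::find_if(Next, Regs.end(), isUsable);
    if (Next == Regs.end()) {
      Spill.FullyAllocated = false;  // registers exhausted: keep the slot
      break;
    }
    Lane = *Next++;
  }
  return Spill;
}

int main() {
  // Ten usable registers fully back a 40-byte (ten-lane) slot...
  std::vector<uint16_t> Regs{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  auto A = allocateLanes(40, Regs, [](uint16_t) { return true; });
  // ...but not a 44-byte one; the remainder falls back to scratch memory.
  auto B = allocateLanes(44, Regs, [](uint16_t) { return true; });
  return (A.FullyAllocated && !B.FullyAllocated) ? 0 : 1;
}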