AMDGPU: Don't use stack space for SGPR->VGPR spills

Before frame offsets are calculated, try to eliminate the
frame indexes used by SGPR spills so that they can be
deleted afterwards.

I think for now we can be sure that no other instruction
will be re-using the same frame indexes. It should be easy
to notice if this assumption ever breaks since everything
asserts if it tries to use a dead frame index later.

The unused emergency stack slot seems to still be left behind,
so an additional 4 bytes is still wasted.

llvm-svn: 295753
Matt Arsenault, 2017-02-21 19:12:08 +00:00
commit e0bf7d02f0 (parent ebfe01c121)
11 changed files with 870 additions and 88 deletions
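
As a rough before/after sketch of the generated code (register numbers and the offset are illustrative, not taken from a specific test in this patch): an SGPR that previously had to round-trip through a scratch stack slot now stays resident in a lane of a reserved VGPR.

  ; before: SGPR spilled through a scratch stack slot
  v_mov_b32 v0, s8
  buffer_store_dword v0, off, s[0:3], s7 offset:4   ; 4-byte Folded Spill
  ...
  buffer_load_dword v0, off, s[0:3], s7 offset:4    ; 4-byte Folded Reload
  v_readfirstlane_b32 s8, v0

  ; after: the SGPR is kept in a lane of a reserved VGPR, no stack slot needed
  v_writelane_b32 v0, s8, 0
  ...
  v_readlane_b32 s8, v0, 0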


@ -128,13 +128,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
// Skip the last 2 elements because the last one is reserved for VCC, and
// this is the 2nd to last element already.
// Skip the last N reserved elements because they should have already been
// reserved for VCC etc.
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
//assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@ -157,7 +156,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
@ -393,17 +391,45 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
assert(RS && "RegScavenger required if spilling");
int ScavengeFI = MFI.CreateStackObject(
AMDGPU::SGPR_32RegClass.getSize(),
AMDGPU::SGPR_32RegClass.getAlignment(), false);
RS->addScavengingFrameIndex(ScavengeFI);
assert((RS || !MayNeedScavengingEmergencySlot) &&
"RegScavenger required if spilling");
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
if (!TRI.spillSGPRToVGPR())
return;
if (MayNeedScavengingEmergencySlot) {
int ScavengeFI = MFI.CreateStackObject(
AMDGPU::SGPR_32RegClass.getSize(),
AMDGPU::SGPR_32RegClass.getAlignment(), false);
RS->addScavengingFrameIndex(ScavengeFI);
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (!FuncInfo->hasSpilledSGPRs())
return;
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
// XXX - This operates under the assumption that only other SGPR spills are
// users of the frame index. I'm not 100% sure this is correct. The
// StackColoring pass has a comment saying a future improvement would be
// merging allocas with spill slots, but for now, according to
// MachineFrameInfo, isSpillSlot objects can't alias any other object.
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
MachineInstr &MI = *I;
Next = std::next(I);
if (TII->isSGPRSpill(MI)) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI))
TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
}
}
}
FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
}
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,


@ -4673,6 +4673,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
case 256:
return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
case 512:
return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
}
case 'v':


@ -36,7 +36,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
: AMDGPUInstrInfo(ST), RI(), ST(ST) {}
: AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks


@ -20,12 +20,6 @@
using namespace llvm;
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
cl::desc("Enable spilling VGPRs to SGPRs"),
cl::ReallyHidden,
cl::init(true));
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
@ -193,45 +187,60 @@ unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
return PrivateMemoryPtrUserSGPR;
}
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
MachineFunction *MF,
unsigned FrameIndex,
unsigned SubIdx) {
if (!EnableSpillSGPRToVGPR)
return SpilledReg();
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int FI) {
std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
// This has already been allocated.
if (!SpillLanes.empty())
return true;
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
Offset += SubIdx * 4;
unsigned Size = FrameInfo.getObjectSize(FI);
assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
unsigned LaneVGPRIdx = Offset / (64 * 4);
unsigned Lane = (Offset / 4) % 64;
int NumLanes = Size / 4;
struct SpilledReg Spill;
Spill.Lane = Lane;
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
unsigned LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
*MF);
if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we won't
// partially spill the SGPR to VGPRs.
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
return false;
}
if (LaneVGPR == AMDGPU::NoRegister)
// We have no VGPRs left for spilling SGPRs.
return Spill;
SpillVGPRs.push_back(LaneVGPR);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
BI != BE; ++BI) {
BI->addLiveIn(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back();
}
SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
}
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
return Spill;
return true;
}
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
for (auto &R : SGPRToVGPRSpills)
MFI.RemoveStackObject(R.first);
}
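
A worked example of the lane bookkeeping above, using the numbers from the split_sgpr_spill_2_vgprs test added later in this patch (v0/v1 and s[24:39] are that test's registers; 50 lanes of v0 are already occupied by earlier spills, and the wave size is 64): a 16-dword SGPR tuple fills the remaining lanes of the current spill VGPR and then wraps into a freshly allocated one.

  v_writelane_b32 v0, s24, 50   ; lanes 50..63 of the current spill VGPR
  ...
  v_writelane_b32 v0, s37, 63
  v_writelane_b32 v1, s38, 0    ; VGPRIndex wrapped back to 0, so a new
  v_writelane_b32 v1, s39, 1    ; VGPR is allocated and the spill continues there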


@ -134,7 +134,8 @@ public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
unsigned PSInputEna;
std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
unsigned NumSystemSGPRs;
@ -195,12 +196,29 @@ public:
bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
// SIMachineFunctionInfo definition
private:
// SGPR->VGPR spilling support.
typedef std::pair<unsigned, unsigned> SpillRegMask;
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
SmallVector<unsigned, 2> SpillVGPRs;
public:
SIMachineFunctionInfo(const MachineFunction &MF);
SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
unsigned SubIdx);
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
return (I == SGPRToVGPRSpills.end()) ?
ArrayRef<SpilledReg>() : makeArrayRef(I->second);
}
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }


@ -24,12 +24,6 @@
using namespace llvm;
static cl::opt<bool> EnableSpillSGPRToSMEM(
"amdgpu-spill-sgpr-to-smem",
cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
cl::init(false));
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()) {
static cl::opt<bool> EnableSpillSGPRToSMEM(
"amdgpu-spill-sgpr-to-smem",
cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
cl::init(false));
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
cl::desc("Enable spilling VGPRs to SGPRs"),
cl::ReallyHidden,
cl::init(true));
SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
SpillSGPRToVGPR(false),
SpillSGPRToSMEM(false) {
if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
SpillSGPRToSMEM = true;
else if (EnableSpillSGPRToVGPR)
SpillSGPRToVGPR = true;
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
@ -557,11 +570,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
}
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS) const {
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
MachineRegisterInfo &MRI = MF->getRegInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@ -570,10 +592,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
bool SpillToSMEM = spillSGPRToSMEM();
if (SpillToSMEM && OnlyToVGPR)
return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@ -646,9 +669,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
continue;
}
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@ -659,6 +682,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// frame index, we should delete the frame index when all references to
// it are fixed.
} else {
// XXX - Can the spill to VGPR fail for some subregisters but not others?
if (OnlyToVGPR)
return false;
// Spill SGPR to a frame index.
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@ -702,22 +729,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
MFI->addToSpilledSGPRs(NumSubRegs);
return true;
}
void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS) const {
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
unsigned SuperReg = MI->getOperand(0).getReg();
bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
bool SpillToSMEM = spillSGPRToSMEM();
if (SpillToSMEM && OnlyToVGPR)
return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@ -785,10 +823,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
continue;
}
SIMachineFunctionInfo::SpilledReg Spill
= MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
@ -798,6 +834,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (NumSubRegs > 1)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
} else {
if (OnlyToVGPR)
return false;
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@ -832,6 +871,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
}
MI->eraseFromParent();
return true;
}
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI,
int FI,
RegScavenger *RS) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
return restoreSGPR(MI, FI, RS, true);
default:
llvm_unreachable("not an SGPR spill instruction");
}
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,


@ -21,8 +21,8 @@
namespace llvm {
class SISubtarget;
class MachineRegisterInfo;
class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@ -31,13 +31,22 @@ private:
unsigned VGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
public:
SIRegisterInfo();
SIRegisterInfo(const SISubtarget &ST);
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
}
bool spillSGPRToSMEM() const {
return SpillSGPRToSMEM;
}
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
@ -78,16 +87,22 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
void spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
/// If \p OnlyToVGPR is true, this will only succeed if the spill can be
/// done entirely to previously allocated VGPR lanes; no scratch memory
/// will be touched.
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
bool OnlyToVGPR = false) const;
void restoreSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
bool restoreSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
unsigned getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}


@ -325,7 +325,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
let AllocationPriority = 11;
}
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 12;


@ -10,6 +10,8 @@
; GCN-LABEL: {{^}}divergent_if_endif:
; VGPR: workitem_private_segment_byte_size = 12{{$}}
; GCN: {{^}}; BB#0:
; GCN: s_mov_b32 m0, -1
@ -31,7 +33,9 @@
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VGPR: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; GCN: s_waitcnt vmcnt(0) expcnt(0)
@ -40,7 +44,8 @@
; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; VMEM: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; VGPR: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; GCN: s_waitcnt vmcnt(0)
; Spill val register
@ -88,6 +93,8 @@ endif:
}
; GCN-LABEL: {{^}}divergent_loop:
; VGPR: workitem_private_segment_byte_size = 16{{$}}
; GCN: {{^}}; BB#0:
; GCN: s_mov_b32 m0, -1


@ -0,0 +1,635 @@
; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s
; Spill all SGPRs so multiple VGPRs are required for spilling all of them.
; Ideally we only need 2 VGPRs for all spilling. The VGPRs are
; allocated per frame index, so it's possible to end up with more.
; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
; GCN: def s[8:15]
; GCN: def s[16:23]
; GCN: def s[24:31]
; GCN: def s[32:39]
; GCN: def s[40:47]
; GCN: def s[48:55]
; GCN: def s[56:63]
; GCN: def s[64:71]
; GCN: def s[72:79]
; GCN: def s[80:87]
; GCN: def s[88:95]
; GCN: v_writelane_b32 v0, s8, 0
; GCN-NEXT: v_writelane_b32 v0, s9, 1
; GCN-NEXT: v_writelane_b32 v0, s10, 2
; GCN-NEXT: v_writelane_b32 v0, s11, 3
; GCN-NEXT: v_writelane_b32 v0, s12, 4
; GCN-NEXT: v_writelane_b32 v0, s13, 5
; GCN-NEXT: v_writelane_b32 v0, s14, 6
; GCN-NEXT: v_writelane_b32 v0, s15, 7
; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 9
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
; GCN-NEXT: v_writelane_b32 v0, s13, 13
; GCN-NEXT: v_writelane_b32 v0, s14, 14
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 16
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 17
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
; GCN-NEXT: v_writelane_b32 v0, s13, 21
; GCN-NEXT: v_writelane_b32 v0, s14, 22
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 24
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 25
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
; GCN-NEXT: v_writelane_b32 v0, s13, 29
; GCN-NEXT: v_writelane_b32 v0, s14, 30
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 32
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 33
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
; GCN-NEXT: v_writelane_b32 v0, s13, 37
; GCN-NEXT: v_writelane_b32 v0, s14, 38
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 40
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 41
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
; GCN-NEXT: v_writelane_b32 v0, s13, 45
; GCN-NEXT: v_writelane_b32 v0, s14, 46
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
; GCN-NEXT: v_writelane_b32 v0, s13, 53
; GCN-NEXT: v_writelane_b32 v0, s14, 54
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
; GCN-NEXT: v_writelane_b32 v0, s88, 56
; GCN-NEXT: v_writelane_b32 v0, s89, 57
; GCN-NEXT: v_writelane_b32 v0, s90, 58
; GCN-NEXT: v_writelane_b32 v0, s91, 59
; GCN-NEXT: v_writelane_b32 v0, s92, 60
; GCN-NEXT: v_writelane_b32 v0, s93, 61
; GCN-NEXT: v_writelane_b32 v0, s94, 62
; GCN-NEXT: v_writelane_b32 v0, s95, 63
; GCN-NEXT: v_writelane_b32 v1, s16, 0
; GCN-NEXT: v_writelane_b32 v1, s17, 1
; GCN-NEXT: v_writelane_b32 v1, s18, 2
; GCN-NEXT: v_writelane_b32 v1, s19, 3
; GCN-NEXT: v_writelane_b32 v1, s20, 4
; GCN-NEXT: v_writelane_b32 v1, s21, 5
; GCN-NEXT: v_writelane_b32 v1, s22, 6
; GCN-NEXT: v_writelane_b32 v1, s23, 7
; GCN-NEXT: v_writelane_b32 v1, s24, 8
; GCN-NEXT: v_writelane_b32 v1, s25, 9
; GCN-NEXT: v_writelane_b32 v1, s26, 10
; GCN-NEXT: v_writelane_b32 v1, s27, 11
; GCN-NEXT: v_writelane_b32 v1, s28, 12
; GCN-NEXT: v_writelane_b32 v1, s29, 13
; GCN-NEXT: v_writelane_b32 v1, s30, 14
; GCN-NEXT: v_writelane_b32 v1, s31, 15
; GCN-NEXT: v_writelane_b32 v1, s32, 16
; GCN-NEXT: v_writelane_b32 v1, s33, 17
; GCN-NEXT: v_writelane_b32 v1, s34, 18
; GCN-NEXT: v_writelane_b32 v1, s35, 19
; GCN-NEXT: v_writelane_b32 v1, s36, 20
; GCN-NEXT: v_writelane_b32 v1, s37, 21
; GCN-NEXT: v_writelane_b32 v1, s38, 22
; GCN-NEXT: v_writelane_b32 v1, s39, 23
; GCN-NEXT: v_writelane_b32 v1, s40, 24
; GCN-NEXT: v_writelane_b32 v1, s41, 25
; GCN-NEXT: v_writelane_b32 v1, s42, 26
; GCN-NEXT: v_writelane_b32 v1, s43, 27
; GCN-NEXT: v_writelane_b32 v1, s44, 28
; GCN-NEXT: v_writelane_b32 v1, s45, 29
; GCN-NEXT: v_writelane_b32 v1, s46, 30
; GCN-NEXT: v_writelane_b32 v1, s47, 31
; GCN-NEXT: v_writelane_b32 v1, s48, 32
; GCN-NEXT: v_writelane_b32 v1, s49, 33
; GCN-NEXT: v_writelane_b32 v1, s50, 34
; GCN-NEXT: v_writelane_b32 v1, s51, 35
; GCN-NEXT: v_writelane_b32 v1, s52, 36
; GCN-NEXT: v_writelane_b32 v1, s53, 37
; GCN-NEXT: v_writelane_b32 v1, s54, 38
; GCN-NEXT: v_writelane_b32 v1, s55, 39
; GCN-NEXT: v_writelane_b32 v1, s56, 40
; GCN-NEXT: v_writelane_b32 v1, s57, 41
; GCN-NEXT: v_writelane_b32 v1, s58, 42
; GCN-NEXT: v_writelane_b32 v1, s59, 43
; GCN-NEXT: v_writelane_b32 v1, s60, 44
; GCN-NEXT: v_writelane_b32 v1, s61, 45
; GCN-NEXT: v_writelane_b32 v1, s62, 46
; GCN-NEXT: v_writelane_b32 v1, s63, 47
; GCN-NEXT: v_writelane_b32 v1, s64, 48
; GCN-NEXT: v_writelane_b32 v1, s65, 49
; GCN-NEXT: v_writelane_b32 v1, s66, 50
; GCN-NEXT: v_writelane_b32 v1, s67, 51
; GCN-NEXT: v_writelane_b32 v1, s68, 52
; GCN-NEXT: v_writelane_b32 v1, s69, 53
; GCN-NEXT: v_writelane_b32 v1, s70, 54
; GCN-NEXT: v_writelane_b32 v1, s71, 55
; GCN-NEXT: v_writelane_b32 v1, s72, 56
; GCN-NEXT: v_writelane_b32 v1, s73, 57
; GCN-NEXT: v_writelane_b32 v1, s74, 58
; GCN-NEXT: v_writelane_b32 v1, s75, 59
; GCN-NEXT: v_writelane_b32 v1, s76, 60
; GCN-NEXT: v_writelane_b32 v1, s77, 61
; GCN-NEXT: v_writelane_b32 v1, s78, 62
; GCN-NEXT: v_writelane_b32 v1, s79, 63
; GCN-NEXT: v_writelane_b32 v2, s80, 0
; GCN-NEXT: v_writelane_b32 v2, s81, 1
; GCN-NEXT: v_writelane_b32 v2, s82, 2
; GCN-NEXT: v_writelane_b32 v2, s83, 3
; GCN-NEXT: v_writelane_b32 v2, s84, 4
; GCN-NEXT: v_writelane_b32 v2, s85, 5
; GCN-NEXT: v_writelane_b32 v2, s86, 6
; GCN-NEXT: v_writelane_b32 v2, s87, 7
; GCN: s_cbranch_scc1
; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 0
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 1
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 2
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 3
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 4
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 5
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 6
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 2
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 3
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 4
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 5
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 6
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 7
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 8
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 9
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 10
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 11
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 12
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 13
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 14
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 15
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 16
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 17
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 18
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 19
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 20
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 21
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 22
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 23
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 24
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 25
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 26
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 27
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 28
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 29
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 30
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 31
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 32
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 33
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 34
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 35
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 36
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 37
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 38
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 39
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 40
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 41
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 42
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 43
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 44
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 45
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 46
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 47
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 48
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 49
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 50
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 51
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 52
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 53
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 54
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 55
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 56
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 57
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 58
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 59
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 60
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 61
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 62
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 11
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 12
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 13
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 14
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 15
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 16
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 17
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 18
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 19
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 20
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 21
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 22
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 23
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 24
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 25
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 26
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 27
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 28
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 29
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 30
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 31
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 32
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 33
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 34
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 35
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 36
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 37
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 38
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 39
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 40
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 41
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 42
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 43
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 44
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 45
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 46
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 {
%wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr2 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr4 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr5 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr6 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr7 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr8 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr9 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr10 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr11 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr12 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr13 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr14 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr15 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr16 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%cmp = icmp eq i32 %in, 0
br i1 %cmp, label %bb0, label %ret
bb0:
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr0) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr1) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr2) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr4) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr5) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr6) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr7) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr8) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr9) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr10) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr11) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr12) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr13) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr14) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr15) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr16) #0
br label %ret
ret:
ret void
}
; Some of the lanes of an SGPR spill go into one VGPR, and the rest are
; forced into the next available VGPR.
; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
; GCN: def s[24:39]
; GCN: v_writelane_b32 v0, s24, 50
; GCN-NEXT: v_writelane_b32 v0, s25, 51
; GCN-NEXT: v_writelane_b32 v0, s26, 52
; GCN-NEXT: v_writelane_b32 v0, s27, 53
; GCN-NEXT: v_writelane_b32 v0, s28, 54
; GCN-NEXT: v_writelane_b32 v0, s29, 55
; GCN-NEXT: v_writelane_b32 v0, s30, 56
; GCN-NEXT: v_writelane_b32 v0, s31, 57
; GCN-NEXT: v_writelane_b32 v0, s32, 58
; GCN-NEXT: v_writelane_b32 v0, s33, 59
; GCN-NEXT: v_writelane_b32 v0, s34, 60
; GCN-NEXT: v_writelane_b32 v0, s35, 61
; GCN-NEXT: v_writelane_b32 v0, s36, 62
; GCN-NEXT: v_writelane_b32 v0, s37, 63
; GCN-NEXT: v_writelane_b32 v1, s38, 0
; GCN-NEXT: v_writelane_b32 v1, s39, 1
; GCN: v_readlane_b32 s4, v0, 50
; GCN-NEXT: v_readlane_b32 s5, v0, 51
; GCN-NEXT: v_readlane_b32 s6, v0, 52
; GCN-NEXT: v_readlane_b32 s7, v0, 53
; GCN-NEXT: v_readlane_b32 s8, v0, 54
; GCN-NEXT: v_readlane_b32 s9, v0, 55
; GCN-NEXT: v_readlane_b32 s10, v0, 56
; GCN-NEXT: v_readlane_b32 s11, v0, 57
; GCN-NEXT: v_readlane_b32 s12, v0, 58
; GCN-NEXT: v_readlane_b32 s13, v0, 59
; GCN-NEXT: v_readlane_b32 s14, v0, 60
; GCN-NEXT: v_readlane_b32 s15, v0, 61
; GCN-NEXT: v_readlane_b32 s16, v0, 62
; GCN-NEXT: v_readlane_b32 s17, v0, 63
; GCN-NEXT: v_readlane_b32 s18, v1, 0
; GCN-NEXT: v_readlane_b32 s19, v1, 1
define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
%wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
%cmp = icmp eq i32 %in, 0
br i1 %cmp, label %bb0, label %ret
bb0:
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0
br label %ret
ret:
ret void
}
; The first 64 SGPR spills can go to a VGPR, but there isn't a second one
; available, so some spills must go to memory. The last 16-element spill
; runs out of lanes at the 15th element.
; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill:
; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15
; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 23
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 24
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 25
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 26
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 27
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 28
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 29
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 30
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 31
; GCN: def s[0:1]
; GCN: v_writelane_b32 v23, s0, 32
; GCN-NEXT: v_writelane_b32 v23, s1, 33
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 34
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 35
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 36
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 37
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 38
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 39
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 40
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 41
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 42
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 43
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 44
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 45
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 46
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 47
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 48
; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 49
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: s_cbranch_scc1
; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29
; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30
; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: v_readlane_b32 s0, v23, 32
; GCN: v_readlane_b32 s1, v23, 33
; GCN: ;;#ASMSTART
; GCN: ; use s[0:1]
define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () #0
call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () #0
call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19}"() #0
call void asm sideeffect "", "~{VGPR20_VGPR21}"() #0
call void asm sideeffect "", "~{VGPR22}"() #0
%wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
%cmp = icmp eq i32 %in, 0
br i1 %cmp, label %bb0, label %ret
bb0:
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0
call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
br label %ret
ret:
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }


@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=TOVGPR %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
@ -17,10 +17,14 @@
; CHECK: s_mov_b32 m0
; Make sure scratch space isn't being used for SGPR->VGPR spills
; FIXME: We seem to be leaving behind an unused emergency slot.
; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0
; CHECK: s_endpgm
; TOVGPR: ScratchSize: 4{{$}}
define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@ -768,6 +772,7 @@ ENDIF66: ; preds = %LOOP65
; CHECK-LABEL: {{^}}main1:
; CHECK: s_endpgm
; TOVGPR: ScratchSize: 4{{$}}
define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0