[AMDGPU] Do not reserve any VGPR for SGPR spills

After the split register allocation changes in eebe841a47 it is no
longer necessary to reserve a VGPR before RA. Reserving one can also
create bugs when IPRA is enabled, since we cannot predict whether a called
function will reserve a register: it will not if it has no SGPR spills.
When that happens, such a function may overwrite reserved registers that
are normally callee-saved. Added a test to show this.

Fixes: SWDEV-309900

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D115551
This commit is contained in:
Austin Kerbow 2022-01-11 22:11:17 -08:00
parent bbced74199
commit 8470bf2b08
7 changed files with 223 additions and 229 deletions

View File

@ -1320,16 +1320,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
const BitVector AllSavedRegs = SavedRegs;
SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
// We have to anticipate introducing CSR VGPR spills or spill of caller
// save VGPR reserved for SGPR spills as we now always create stack entry
// for it, if we don't have any stack objects already, since we require
// an FP if there is a call and stack.
// for it, if we don't have any stack objects already, since we require a FP
// if there is a call and stack. We will allocate a VGPR for SGPR spills if
// there are any SGPR spills. Whether they are CSR spills or otherwise.
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const bool WillHaveFP =
FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
// FP will be specially managed like SP.
if (WillHaveFP || hasFP(MF))

View File

@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
static cl::opt<bool> VGPRReserveforSGPRSpill(
"amdgpu-reserve-vgpr-for-sgpr-spill",
cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
static cl::opt<bool> UseDivergentRegisterIndexing(
"amdgpu-use-divergent-register-indexing",
cl::Hidden,
@ -11990,13 +11986,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
TargetLoweringBase::finalizeLowering(MF);
// Allocate a VGPR for future SGPR Spill if
// "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
// FIXME: We won't need this hack if we split SGPR allocation from VGPR
if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
!Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
Info->reserveVGPRforSGPRSpills(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(

View File

@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
//
// Returns false, without modifying anything, when the function info has no
// pre-reserved VGPR recorded in VGPRReservedForSGPRSpill. Otherwise it
// re-targets the SpillVGPRs entry of the pre-reserved register to the chosen
// VGPR together with a freshly created spill stack slot, adds that VGPR as a
// live-in of every basic block in MF, and returns true.
static bool lowerShiftReservedVGPR(MachineFunction &MF,
const GCNSubtarget &ST) {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
// Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
if (!PreReservedVGPR)
return false;
// If there are no free lower VGPRs available, default to using the
// pre-reserved register instead.
// NOTE(review): that findUnusedRegister yields the *lowest* free VGPR is
// implied by the naming here, not shown in this function -- confirm.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register LowestAvailableVGPR =
TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
if (!LowestAvailableVGPR)
LowestAvailableVGPR = PreReservedVGPR;
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
// Create a stack object for a possible spill in the function prologue.
// Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
// The slot is 4 bytes with 4-byte alignment.
Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
// Find saved info about the pre-reserved register.
const auto *ReservedVGPRInfoItr =
llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
[PreReservedVGPR](const auto &SpillRegInfo) {
return SpillRegInfo.VGPR == PreReservedVGPR;
});
// The pre-reserved register is expected to already have an entry in the
// spill-VGPR list; a miss indicates inconsistent bookkeeping.
assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
// Convert the iterator to a position so the entry can be rewritten in place.
auto Index =
std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
// Make the chosen VGPR a live-in of every block in the function.
for (MachineBasicBlock &MBB : MF) {
// NOTE(review): this check is loop-invariant; LowestAvailableVGPR already
// fell back to the non-null PreReservedVGPR above.
assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
MBB.addLiveIn(LowestAvailableVGPR);
MBB.sortUniqueLiveIns();
}
return true;
}
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.hasStackObjects() && !HasCSRs) {
SaveBlocks.clear();
RestoreBlocks.clear();
if (FuncInfo->VGPRReservedForSGPRSpill) {
// Free the reserved VGPR for later possible use by frame lowering.
FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
MRI.freezeReservedRegs(MF);
}
return false;
}
@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// This operates under the assumption that only other SGPR spills are users
// of the frame index.
lowerShiftReservedVGPR(MF, ST);
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
FuncInfo->removeDeadFrameIndices(MFI);
MadeChange = true;
} else if (FuncInfo->VGPRReservedForSGPRSpill) {
FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
}
SaveBlocks.clear();

View File

@ -274,7 +274,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
unsigned Size = FrameInfo.getObjectSize(FI);
unsigned NumLanes = Size / 4;
@ -291,16 +290,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
Register LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
// Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and
// when one of the two conditions is true:
// 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
// reserved.
// 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
// required.
if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
} else if (VGPRIndex == 0) {
if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
@ -308,6 +298,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
// FIXME: We can run out of free registers with split allocation if
// IPRA is enabled and a called function already uses every VGPR.
#if 0
DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
"VGPRs for SGPR spilling",
@ -340,21 +332,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
/// Set aside one VGPR to hold future SGPR spills.
///
/// Asks the register info for an unused 32-bit VGPR. On success the register
/// is appended to SpillVGPRs (with no frame index) and recorded in
/// VGPRReservedForSGPRSpill of the function info obtained from \p MF.
/// \returns true if a register was reserved, false if none was available.
bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
  const SIRegisterInfo *RegInfo =
      MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  const Register Candidate = RegInfo->findUnusedRegister(
      MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);

  // A default-constructed (invalid) Register signals that no VGPR is free.
  if (!Candidate.isValid())
    return false;

  SpillVGPRs.push_back(SGPRSpillVGPR(Candidate, None));
  Info->VGPRReservedForSGPRSpill = Candidate;
  return true;
}
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
@ -616,24 +593,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
return false;
}
// Release the VGPR that was reserved for SGPR spills (used by callers when no
// SGPR ended up being spilled).
//
// Looks up ReservedVGPR in SpillVGPRs. If present, the entry is erased, the
// register is dropped from the live-in list of every basic block in MF, and
// VGPRReservedForSGPRSpill is reset to AMDGPU::NoRegister.
// Returns true if an entry was removed, false if ReservedVGPR was not found.
bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
                                                   MachineFunction &MF) {
  // Linear search for the first entry that uses the reserved register.
  auto *Entry = SpillVGPRs.begin();
  while (Entry != SpillVGPRs.end() && Entry->VGPR != ReservedVGPR)
    ++Entry;

  if (Entry == SpillVGPRs.end())
    return false;

  SpillVGPRs.erase(Entry);

  for (MachineBasicBlock &Block : MF) {
    Block.removeLiveIn(ReservedVGPR);
    Block.sortUniqueLiveIns();
  }

  this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
  return true;
}
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
if (UsesAGPRs)
return *UsesAGPRs;

View File

@ -502,7 +502,6 @@ public: // FIXME
Register SGPRForBPSaveRestoreCopy;
Optional<int> BasePointerSaveIndex;
Register VGPRReservedForSGPRSpill;
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
public:
@ -528,7 +527,6 @@ public:
void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
SpillVGPRs[Index].VGPR = NewVGPR;
SpillVGPRs[Index].FI = newFI;
VGPRReservedForSGPRSpill = NewVGPR;
}
bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
@ -556,7 +554,6 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
bool reserveVGPRforSGPRSpills(MachineFunction &MF);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);

View File

@ -520,58 +520,58 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:48
; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:64
; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:80
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:96
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:112
; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:128
; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:144
; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:160
; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:176
; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:192
; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:32
; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:48
; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:64
; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:80
; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:96
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:128
; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:144
; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:160
; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:192
; GCN-NEXT: s_add_i32 s32, s32, 0x10000
; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:208
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:224
; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
; GCN-NEXT: v_and_b32_e32 v0, 31, v2
; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
@ -582,50 +582,50 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:288
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:292
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:296
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:300
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:304
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:308
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:312
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:316
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:320
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:324
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:328
; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:332
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:336
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:340
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:344
; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:348
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:352
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:356
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:360
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:364
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:368
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:372
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:376
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:380
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:384
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:388
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:392
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:396
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:400
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:404
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:408
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:412
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:432
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:436
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:440
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:444
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:288
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:292
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:296
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:300
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:304
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:308
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:312
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:316
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:320
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:324
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:328
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:332
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:336
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:340
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:344
; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:348
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:352
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:356
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:360
; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:364
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:384
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:388
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:392
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:396
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
@ -676,10 +676,10 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504

View File

@ -5,7 +5,7 @@ define void @child_function() #0 {
ret void
}
; GCN-LABEL: {{^}}reserve_vgpr_with_no_lower_vgpr_available:
; GCN-LABEL: {{^}}spill_sgpr_with_no_lower_vgpr_available:
; GCN: buffer_store_dword v255, off, s[0:3], s32
; GCN: v_writelane_b32 v255, s33, 2
; GCN: v_writelane_b32 v255, s30, 0
@ -16,7 +16,7 @@ define void @child_function() #0 {
; GCN: v_readlane_b32 s33, v255, 2
; GCN: ; NumVgprs: 256
define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@ -51,7 +51,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
ret void
}
; GCN-LABEL: {{^}}reserve_lowest_available_vgpr:
; GCN-LABEL: {{^}}spill_to_lowest_available_vgpr:
; GCN: buffer_store_dword v254, off, s[0:3], s32
; GCN: v_writelane_b32 v254, s33, 2
; GCN: v_writelane_b32 v254, s30, 0
@ -61,7 +61,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
; GCN: v_readlane_b32 s31, v254, 1
; GCN: v_readlane_b32 s33, v254, 2
define void @reserve_lowest_available_vgpr() #0 {
define void @spill_to_lowest_available_vgpr() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@ -96,14 +96,14 @@ define void @reserve_lowest_available_vgpr() #0 {
ret void
}
; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills:
; GCN-LABEL: {{^}}spill_sgpr_with_sgpr_uses:
; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32
; GCN: ; def s4
; GCN: v_writelane_b32 v254, s4, 2
; GCN: v_readlane_b32 s4, v254, 2
; GCN: ; use s4
define void @reserve_vgpr_with_sgpr_spills() #0 {
define void @spill_sgpr_with_sgpr_uses() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@ -147,12 +147,12 @@ ret:
ret void
}
; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call
; GCN-LABEL: {{^}}spill_sgpr_with_tail_call
; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32
; GCN-NOT: v_writelane
; GCN: s_setpc_b64 s[4:5]
define void @reserve_vgpr_with_tail_call() #0 {
define void @spill_sgpr_with_tail_call() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@ -187,17 +187,29 @@ define void @reserve_vgpr_with_tail_call() #0 {
ret void
}
; GCN-LABEL: {{^}}reserve_vgpr_for_sgpr_spills_no_alloca:
; GCN: v_writelane_b32 v5, s34, 0
; GCN: v_writelane_b32 v5, s35, 1
; GCN: v_writelane_b32 v5, s36, 2
; GCN: v_writelane_b32 v5, s37, 3
; GCN: v_readlane_b32 s37, v5, 3
; GCN: v_readlane_b32 s36, v5, 2
; GCN: v_readlane_b32 s35, v5, 1
; GCN: v_readlane_b32 s34, v5, 0
; Special case where all registers are explicitly clobbered in the function and
; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory.
define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr:
; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0
; GCN: buffer_store_dword [[A]], off, s[0:3], s32
; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0
; GCN: buffer_store_dword [[B]], off, s[0:3], s32
; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0
; GCN: buffer_store_dword [[C]], off, s[0:3], s32
; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0
; GCN: buffer_store_dword [[D]], off, s[0:3], s32
; GCN: #ASMEND
; GCN: buffer_load_dword [[E:v[0-9]+]]
; GCN: v_readlane_b32 s37, [[E]], 0
; GCN: buffer_load_dword [[F:v[0-9]+]]
; GCN: v_readlane_b32 s36, [[F]], 0
; GCN: buffer_load_dword [[G:v[0-9]+]]
; GCN: v_readlane_b32 s35, [[G]], 0
; GCN: buffer_load_dword [[H:v[0-9]+]]
; GCN: v_readlane_b32 s34, [[H]], 0
define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
call void asm sideeffect "",
"~{v6},~{v7},~{v8},~{v9}
@ -234,4 +246,96 @@ define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out
ret void
}
; If IPRA no-CSR optimization is enabled, we will not be able to allocate a
; VGPR for SGPR spills in the parent function since this child function uses all
; VGPRs.
define internal void @child_function_ipra() #0 {
call void asm sideeffect "",
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
ret void
}
; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra:
; GCN: v_writelane_b32 v0, s30, 0
; GCN: v_writelane_b32 v0, s31, 1
; GCN: buffer_store_dword v0, off
; GCN: swappc
; GCN: buffer_load_dword v0, off
; GCN: v_readlane_b32 s30, v0, 0
; GCN: v_readlane_b32 s31, v0, 1
define void @spill_sgpr_no_free_vgpr_ipra() #0 {
call void @child_function_ipra()
ret void
}
; Internal callee used by the tail-call test: its only instruction besides the
; return is an empty inline asm whose clobber list names every VGPR from v0
; through v255.
define internal void @child_function_ipra_tail_call() #0 {
call void asm sideeffect "",
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
ret void
}
; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra_tail_call:
; GCN-NOT: v_writelane_b32
; GCN-NOT: buffer_store_dword
; GCN-NOT: swappc
; GCN-NOT: buffer_load_dword v0, off
; GCN-NOT: v_readlane_b32
; GCN: setpc
define void @spill_sgpr_no_free_vgpr_ipra_tail_call() #0 {
tail call void @child_function_ipra_tail_call()
ret void
}
attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }