For PAL, make sure Scratch Buffer Descriptor do not clobber GIT pointer

Since SRSRC has alignment requirements, first find non GIT pointer clobbered
registers for SRSRC and then if those registers clobber preloaded Scratch Wave
Offset register, copy the Scratch Wave Offset register to a free SGPR.
This commit is contained in:
Ram Nalamothu 2020-04-30 18:25:24 +00:00 committed by Matt Arsenault
parent 2f1fe1864d
commit f7060f4f88
6 changed files with 142 additions and 63 deletions

View File

@ -30,6 +30,11 @@ static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
ST.getMaxNumSGPRs(MF) / 4);
}
static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
@ -257,7 +262,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
MachineFunction &MF, Register ScratchWaveOffsetReg) const {
MachineFunction &MF) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@ -269,9 +274,8 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
Register ScratchRsrcReg = MFI->getScratchRSrcReg();
if (ScratchRsrcReg == AMDGPU::NoRegister ||
!MRI.isPhysRegUsed(ScratchRsrcReg))
return AMDGPU::NoRegister;
if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
return Register();
if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@ -292,17 +296,13 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
// Skip the last N reserved elements because they should have already been
// reserved for VCC etc.
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
//
// FIXME: The preloaded SGPR count is not accurate for shaders as the
// scratch wave offset may be in a fixed SGPR or
// SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the
// scratch wave offset. We explicitly avoid the scratch wave offset to
// account for this.
// reserved input we needed. Also for PAL, make sure we don't clobber
// the GIT pointer passed in SGPR0 or SGPR8.
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
!TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
!TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@ -330,28 +330,28 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
assert(MFI->isEntryFunction());
Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
// FIXME: Hack to not crash in situations which emitted an error.
if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
if (!PreloadedScratchWaveOffsetReg)
return;
// We need to do the replacement of the private segment buffer register even
// if there are no stack objects. There could be stores to undef or a
// constant without an associated object.
//
// This will return `AMDGPU::NoRegister` in cases where there are no actual
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
Register ScratchRsrcReg =
getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
if (ScratchRsrcReg != AMDGPU::NoRegister) {
if (ScratchRsrcReg) {
for (MachineBasicBlock &OtherBB : MF) {
if (&OtherBB != &MBB) {
OtherBB.addLiveIn(ScratchRsrcReg);
@ -361,12 +361,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
Register PreloadedScratchRsrcReg;
if (ST.isAmdHsaOrMesa(F)) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg != AMDGPU::NoRegister &&
PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
// We added live-ins during argument lowering, but since they were not
// used they were deleted. We're adding the uses now, so add them back.
MRI.addLiveIn(PreloadedScratchRsrcReg);
@ -379,6 +378,32 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
// We found the SRSRC first because it needs four registers and has an
// alignment requirement. If the SRSRC that we found is clobbering with
// the scratch wave offset, which may be in a fixed SGPR or a free SGPR
// chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
// wave offset to a free SGPR.
Register ScratchWaveOffsetReg;
if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
AllSGPRs = AllSGPRs.slice(
std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPRs) {
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
!TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
ScratchWaveOffsetReg = Reg;
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
break;
}
}
} else {
ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
}
assert(ScratchWaveOffsetReg);
if (MF.getFrameInfo().hasCalls()) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
@ -392,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
MRI.addLiveIn(ScratchWaveOffsetReg);
MBB.addLiveIn(ScratchWaveOffsetReg);
if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
if (MFI->hasFlatScratchInit()) {
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (ScratchRsrcReg != AMDGPU::NoRegister) {
if (ScratchRsrcReg) {
emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
PreloadedScratchRsrcReg,
ScratchRsrcReg, ScratchWaveOffsetReg);
@ -437,19 +462,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
if (ST.hasMergedShaders()) {
switch (MF.getFunction().getCallingConv()) {
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
// Low GIT address is passed in s8 rather than s0 for an LS+HS or
// ES+GS merged shader on gfx9+.
GitPtrLo = AMDGPU::SGPR8;
break;
default:
break;
}
}
Register GitPtrLo = MFI->getGITPtrLoReg(MF);
MF.getRegInfo().addLiveIn(GitPtrLo);
MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@ -475,8 +488,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(0) // dlc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
} else if (ST.isMesaGfxShader(Fn) ||
(PreloadedScratchRsrcReg == AMDGPU::NoRegister)) {
} else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@ -537,7 +549,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
assert(PreloadedScratchRsrcReg);
if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
@ -650,7 +662,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Reg.FI.getValue());
}
if (ScratchExecCopy != AMDGPU::NoRegister) {
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@ -659,7 +671,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.addReg(ScratchExecCopy);
}
if (FuncInfo->FramePointerSaveIndex) {
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
assert(!MFI.isDeadObjectIndex(FI) &&
@ -690,8 +701,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register ScratchSPReg = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
assert(ScratchSPReg != AMDGPU::NoRegister &&
ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
@ -785,7 +795,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
continue;
const SIRegisterInfo &TRI = TII->getRegisterInfo();
if (ScratchExecCopy == AMDGPU::NoRegister) {
if (!ScratchExecCopy) {
// See emitPrologue
if (LiveRegs.empty()) {
LiveRegs.init(*ST.getRegisterInfo());
@ -809,7 +819,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
}
if (ScratchExecCopy != AMDGPU::NoRegister) {
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@ -991,7 +1001,7 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
for (auto &CS : CSI) {
if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
if (FuncInfo->SGPRForFPSaveRestoreCopy)
CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
break;
}

View File

@ -61,9 +61,7 @@ private:
const DebugLoc &DL,
Register ScratchWaveOffsetReg) const;
Register
getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
Register ScratchWaveOffsetReg) const;
Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,

View File

@ -439,6 +439,27 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.isAmdPalOS())
return Register();
Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
if (ST.hasMergedShaders()) {
switch (MF.getFunction().getCallingConv()) {
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
// Low GIT address is passed in s8 rather than s0 for an LS+HS or
// ES+GS merged shader on gfx9+.
GitPtrLo = AMDGPU::SGPR8;
return GitPtrLo;
default:
return GitPtrLo;
}
}
return GitPtrLo;
}
static yaml::StringValue regToString(Register Reg,
const TargetRegisterInfo &TRI) {
yaml::StringValue Dest;

View File

@ -676,6 +676,8 @@ public:
return GITPtrHigh;
}
Register getGITPtrLoReg(const MachineFunction &MF) const;
uint32_t get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}

View File

@ -0,0 +1,48 @@
# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -run-pass=prologepilog -o - %s | FileCheck %s
# On PAL, we need to ensure SRSRC do not clobber GIT pointer, passed
# in SGPR8 for HS or GS
--- |
define amdgpu_gs void @shader(i32 inreg %mergedGroupInfo) {
ret void
}
...
---
name: shader
tracksRegLiveness: true
liveins:
- { reg: '$sgpr0' }
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103'
stackPtrOffsetReg: '$sgpr32'
argumentInfo:
privateSegmentWaveByteOffset: { reg: '$sgpr5' }
body: |
; CHECK: $sgpr1 = COPY killed $sgpr5
; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
bb.0:
successors: %bb.1, %bb.2
liveins: $sgpr0
$exec_lo = S_MOV_B32 -1
renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
renamable $sgpr0 = S_BFE_U32 killed renamable $sgpr0, 589836, implicit-def dead $scc
renamable $vcc_lo = V_CMP_GT_U32_e64 killed $sgpr0, killed $vgpr0, implicit $exec
$vcc_hi = IMPLICIT_DEF
$sgpr0 = S_AND_SAVEEXEC_B32 $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
S_CBRANCH_EXECZ %bb.2, implicit $exec
S_BRANCH %bb.1
bb.1:
renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
bb.2:
S_ENDPGM 0
...

View File

@ -14,14 +14,14 @@
;
; GCN-LABEL: {{^}}ps_main:
; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s6, -1
; SI-DAG: s_mov_b32 s7, 0xe8f000
; VI-DAG: s_mov_b32 s7, 0xe80000
; GFX9-DAG: s_mov_b32 s7, 0xe00000
; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000
; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000
; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s2, -1
; SI-DAG: s_mov_b32 s3, 0xe8f000
; VI-DAG: s_mov_b32 s3, 0xe80000
; GFX9-DAG: s_mov_b32 s3, 0xe00000
; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000
; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GCN-NOT: s_mov_b32 s0
@ -39,7 +39,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
}
; GCN-LABEL: {{^}}vs_main:
; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s0
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@ -51,7 +51,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
}
; GCN-LABEL: {{^}}cs_main:
; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
define amdgpu_cs float @cs_main(i32 %idx) {
@ -62,7 +62,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
}
; GCN-LABEL: {{^}}hs_main:
; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; SIVI-NOT: s_mov_b32 s0
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@ -79,7 +79,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
}
; GCN-LABEL: {{^}}gs_main:
; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen