[AMDGPU] Fix saving fp and bp

Spilling the fp or bp to scratch could overwrite VGPRs of inactive
lanes. Fix that by using only the active lanes of the scavenged VGPR.

This builds on the assumptions that
1. a function is never called with exec=0
2. lanes do not die in a function, i.e. exec!=0 in the function epilog
3. no new lanes are active when exiting the function, i.e. exec in the
   epilog is a subset of exec in the prolog.

Differential Revision: https://reviews.llvm.org/D96869
This commit is contained in:
Sebastian Neubauer 2021-04-12 11:47:16 +02:00
parent ca3bae94c4
commit b76c2a6c2b
4 changed files with 40 additions and 47 deletions

View File

@@ -648,6 +648,22 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
llvm_unreachable("Invalid TargetStackID::Value");
}
// Lazily set up \p LiveRegs for the prolog or epilog of \p MBB: if the set is
// already populated this is a no-op; otherwise seed it from the block's
// live-ins (prolog) or from its live-outs walked backwards over the
// instruction at \p MBBI (epilog).
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (!LiveRegs.empty())
    return;

  LiveRegs.init(TRI);
  if (IsProlog) {
    LiveRegs.addLiveIns(MBB);
    return;
  }
  // In epilog: start from the live-outs and step back across *MBBI.
  LiveRegs.addLiveOuts(MBB);
  LiveRegs.stepBackward(*MBBI);
}
// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
MachineFunction &MF,
@@ -659,19 +675,10 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
DebugLoc DL;
if (LiveRegs.empty()) {
if (IsProlog) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
} else {
// In epilog.
LiveRegs.init(*ST.getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
LiveRegs.stepBackward(*MBBI);
}
}
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
@@ -740,13 +747,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, Reg.VGPR, *Reg.FI);
}
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
.addReg(ScratchExecCopy, RegState::Kill);
LiveRegs.addReg(ScratchExecCopy);
}
if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
const int FramePtrFI = *FPSaveIndex;
assert(!MFI.isDeadObjectIndex(FramePtrFI));
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -764,9 +778,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -780,15 +792,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
BasePtrFI);
}
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
.addReg(ScratchExecCopy, RegState::Kill);
LiveRegs.addReg(ScratchExecCopy);
}
// In this case, spill the FP to a reserved VGPR.
if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
const int FramePtrFI = *FPSaveIndex;
@@ -968,14 +971,11 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameDestroy);
}
Register ScratchExecCopy;
if (FPSaveIndex) {
const int FramePtrFI = *FPSaveIndex;
assert(!MFI.isDeadObjectIndex(FramePtrFI));
if (spilledToMemory(MF, FramePtrFI)) {
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -1001,9 +1001,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
if (spilledToMemory(MF, BasePtrFI)) {
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
@@ -1025,6 +1023,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
Register ScratchExecCopy;
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())

View File

@@ -563,18 +563,14 @@ define void @ipra_call_with_stack() #0 {
; With no free registers, we must spill the FP to memory.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; FLATSCR: s_mov_b32 s0, s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
; FLATSCR: s_mov_b32 s33, s0
; MUBUF: s_waitcnt vmcnt(0)
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; MUBUF: ScratchSize: 8
; FLATSCR: ScratchSize: 0
@@ -598,16 +594,16 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
; VGPR.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NOT: v_writelane_b32 v40, s33
; MUBUF: s_mov_b32 s33, s32
; FLATSCR: s_mov_b32 s33, s0
; GCN-NOT: v_readlane_b32 s33, v40
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
@@ -672,10 +668,10 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
; MUBUF: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; MUBUF-NEXT: v_mov_b32_e32 v0, s33
; MUBUF: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x100c
; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
; FLATSCR: v_mov_b32_e32 v0, 0
; FLATSCR: scratch_store_dword off, v0, [[SOFF]]

View File

@@ -38,10 +38,8 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
; NO-SPILL-TO-VGPR: ; %bb.0:
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x800
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
@@ -60,7 +58,7 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -69,13 +67,11 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v2, 1
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
; NO-SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x800
; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5]
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca

View File

@@ -294,6 +294,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
; GCN: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_add_u32 s6, s32, 0x42100
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x1088
; GCN-NEXT: s_add_u32 s6, s32, 0x42200
@@ -301,6 +302,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NOT: v_mov_b32_e32 v0, 0x108c
; GCN-NEXT: s_add_u32 s6, s32, 0x42300
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, i32 addrspace(5)* %local_val, align 128