forked from OSchip/llvm-project
[AMDGPU] Fix saving fp and bp
Spilling the FP or BP to scratch could overwrite VGPRs of inactive lanes. Fix that by using only the active lanes of the scavenged VGPR. This builds on the following assumptions: 1. a function is never called with exec=0; 2. lanes do not die in a function, i.e. exec!=0 in the function epilog; 3. no new lanes are active when exiting the function, i.e. exec in the epilog is a subset of exec in the prolog. Differential Revision: https://reviews.llvm.org/D96869
This commit is contained in:
parent
ca3bae94c4
commit
b76c2a6c2b
|
@@ -648,6 +648,22 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
|
|||
llvm_unreachable("Invalid TargetStackID::Value");
|
||||
}
|
||||
|
||||
// Lazily initialize \p LiveRegs at \p MBBI: the LiveRegs.empty() guard makes
// repeated calls no-ops after the first. In the prolog the live registers are
// the block's live-ins; in the epilog they are the block's live-outs stepped
// backward over the instruction at MBBI. FuncInfo and MF are currently unused
// by the body.
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
|
||||
const SIMachineFunctionInfo *FuncInfo,
|
||||
MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI, bool IsProlog) {
|
||||
if (LiveRegs.empty()) {
|
||||
LiveRegs.init(TRI);
|
||||
if (IsProlog) {
|
||||
LiveRegs.addLiveIns(MBB);
|
||||
} else {
|
||||
// In epilog.
|
||||
LiveRegs.addLiveOuts(MBB);
|
||||
LiveRegs.stepBackward(*MBBI);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Activate all lanes, returns saved exec.
|
||||
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
|
||||
MachineFunction &MF,
|
||||
|
@@ -659,19 +675,10 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
|
|||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
||||
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
|
||||
DebugLoc DL;
|
||||
|
||||
if (LiveRegs.empty()) {
|
||||
if (IsProlog) {
|
||||
LiveRegs.init(TRI);
|
||||
LiveRegs.addLiveIns(MBB);
|
||||
} else {
|
||||
// In epilog.
|
||||
LiveRegs.init(*ST.getRegisterInfo());
|
||||
LiveRegs.addLiveOuts(MBB);
|
||||
LiveRegs.stepBackward(*MBBI);
|
||||
}
|
||||
}
|
||||
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
|
||||
|
||||
ScratchExecCopy = findScratchNonCalleeSaveRegister(
|
||||
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
|
||||
|
@@ -740,13 +747,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
|
|||
buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBBI, Reg.VGPR, *Reg.FI);
|
||||
}
|
||||
|
||||
if (ScratchExecCopy) {
|
||||
// FIXME: Split block and make terminator.
|
||||
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
||||
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
|
||||
.addReg(ScratchExecCopy, RegState::Kill);
|
||||
LiveRegs.addReg(ScratchExecCopy);
|
||||
}
|
||||
|
||||
if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
|
||||
const int FramePtrFI = *FPSaveIndex;
|
||||
assert(!MFI.isDeadObjectIndex(FramePtrFI));
|
||||
|
||||
if (!ScratchExecCopy)
|
||||
ScratchExecCopy =
|
||||
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
|
||||
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
|
||||
|
||||
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
||||
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
|
||||
|
@@ -764,9 +778,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
|
|||
const int BasePtrFI = *BPSaveIndex;
|
||||
assert(!MFI.isDeadObjectIndex(BasePtrFI));
|
||||
|
||||
if (!ScratchExecCopy)
|
||||
ScratchExecCopy =
|
||||
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
|
||||
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
|
||||
|
||||
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
||||
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
|
||||
|
@@ -780,15 +792,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
|
|||
BasePtrFI);
|
||||
}
|
||||
|
||||
if (ScratchExecCopy) {
|
||||
// FIXME: Split block and make terminator.
|
||||
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
||||
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
|
||||
.addReg(ScratchExecCopy, RegState::Kill);
|
||||
LiveRegs.addReg(ScratchExecCopy);
|
||||
}
|
||||
|
||||
// In this case, spill the FP to a reserved VGPR.
|
||||
if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
|
||||
const int FramePtrFI = *FPSaveIndex;
|
||||
|
@@ -968,14 +971,11 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
.setMIFlag(MachineInstr::FrameDestroy);
|
||||
}
|
||||
|
||||
Register ScratchExecCopy;
|
||||
if (FPSaveIndex) {
|
||||
const int FramePtrFI = *FPSaveIndex;
|
||||
assert(!MFI.isDeadObjectIndex(FramePtrFI));
|
||||
if (spilledToMemory(MF, FramePtrFI)) {
|
||||
if (!ScratchExecCopy)
|
||||
ScratchExecCopy =
|
||||
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
|
||||
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
|
||||
|
||||
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
||||
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
|
||||
|
@@ -1001,9 +1001,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
const int BasePtrFI = *BPSaveIndex;
|
||||
assert(!MFI.isDeadObjectIndex(BasePtrFI));
|
||||
if (spilledToMemory(MF, BasePtrFI)) {
|
||||
if (!ScratchExecCopy)
|
||||
ScratchExecCopy =
|
||||
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
|
||||
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
|
||||
|
||||
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
||||
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
|
||||
|
@@ -1025,6 +1023,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
}
|
||||
}
|
||||
|
||||
Register ScratchExecCopy;
|
||||
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
|
||||
FuncInfo->getSGPRSpillVGPRs()) {
|
||||
if (!Reg.FI.hasValue())
|
||||
|
|
|
@@ -563,18 +563,14 @@ define void @ipra_call_with_stack() #0 {
|
|||
|
||||
; With no free registers, we must spill the FP to memory.
|
||||
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
|
||||
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
|
||||
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
|
||||
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
|
||||
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
|
||||
; FLATSCR: s_mov_b32 s0, s33
|
||||
; GCN: s_mov_b32 s33, s32
|
||||
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
|
||||
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
|
||||
; FLATSCR: s_mov_b32 s33, s0
|
||||
; MUBUF: s_waitcnt vmcnt(0)
|
||||
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
|
||||
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
|
||||
; GCN: s_setpc_b64
|
||||
; MUBUF: ScratchSize: 8
|
||||
; FLATSCR: ScratchSize: 0
|
||||
|
@@ -598,16 +594,16 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
|
|||
; VGPR.
|
||||
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
|
||||
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
|
||||
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
|
||||
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
|
||||
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
|
||||
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
|
||||
; GCN-NOT: v_writelane_b32 v40, s33
|
||||
; MUBUF: s_mov_b32 s33, s32
|
||||
; FLATSCR: s_mov_b32 s33, s0
|
||||
; GCN-NOT: v_readlane_b32 s33, v40
|
||||
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
|
||||
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
|
||||
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
|
||||
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
|
||||
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
|
||||
; GCN: s_setpc_b64
|
||||
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
|
||||
|
@@ -672,10 +668,10 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
|
|||
; MUBUF: s_or_saveexec_b64 s[4:5], -1
|
||||
; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
|
||||
; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, s33
|
||||
; MUBUF: v_mov_b32_e32 v0, s33
|
||||
; GCN-NOT: v_mov_b32_e32 v0, 0x100c
|
||||
; MUBUF-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
|
||||
; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
|
||||
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
|
||||
; FLATSCR: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
|
||||
|
|
|
@@ -38,10 +38,8 @@ define void @callee_with_stack_and_call() #0 {
|
|||
; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
|
||||
; NO-SPILL-TO-VGPR: ; %bb.0:
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, s33
|
||||
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
|
||||
|
@@ -60,7 +58,7 @@ define void @callee_with_stack_and_call() #0 {
|
|||
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
|
||||
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16
|
||||
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
|
@@ -69,13 +67,11 @@ define void @callee_with_stack_and_call() #0 {
|
|||
; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v2, 1
|
||||
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
|
||||
; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s33, v0
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; NO-SPILL-TO-VGPR-NEXT: s_setpc_b64 s[4:5]
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
store volatile i32 0, i32 addrspace(5)* %alloca
|
||||
|
|
|
@@ -294,6 +294,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
|
|||
; GCN: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: s_add_u32 s6, s32, 0x42100
|
||||
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s33
|
||||
; GCN-NOT: v_mov_b32_e32 v0, 0x1088
|
||||
; GCN-NEXT: s_add_u32 s6, s32, 0x42200
|
||||
|
@@ -301,6 +302,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
|
|||
; GCN-NEXT: v_mov_b32_e32 v0, s34
|
||||
; GCN-NOT: v_mov_b32_e32 v0, 0x108c
|
||||
; GCN-NEXT: s_add_u32 s6, s32, 0x42300
|
||||
; GCN-NEXT: s_mov_b32 s34, s32
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
%local_val = alloca i32, align 128, addrspace(5)
|
||||
store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
|
||||
|
|
Loading…
Reference in New Issue