AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831
This commit is contained in:
Matt Arsenault 2018-03-29 21:30:06 +00:00
parent 50635dab26
commit 03ae399d50
4 changed files with 210 additions and 8 deletions

View File

@ -587,6 +587,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||

View File

@ -13,6 +13,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@ -487,9 +488,43 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
}
}
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
MachineFunction *MF = MBB.getParent();
const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(MBB);
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
MachineRegisterInfo &MRI = MF->getRegInfo();
for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
return AMDGPU::NoRegister;
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
return;
@ -498,6 +533,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const MachineFrameInfo &MFI = MF.getFrameInfo();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@ -505,8 +541,36 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
// XXX - Is this the right predicate?
bool NeedFP = hasFP(MF);
if (NeedFP) {
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
const bool NeedsRealignment = TRI.needsStackRealignment(MF);
if (NeedsRealignment) {
assert(NeedFP);
const unsigned Alignment = MFI.getMaxAlignment();
const unsigned ZeroLowBits = countTrailingZeros(Alignment);
assert(ZeroLowBits > 1);
RoundedSize += Alignment;
unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
assert(ScratchSPReg != AMDGPU::NoRegister);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
.addReg(StackPtrReg)
.addImm((Alignment - 1) * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
.addReg(ScratchSPReg, RegState::Kill)
.addImm(-Alignment * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
} else if (NeedFP) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@ -516,11 +580,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
uint32_t NumBytes = MFI.getStackSize();
if (NumBytes != 0 && hasSP(MF)) {
if (RoundedSize != 0 && hasSP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(NumBytes * ST.getWavefrontSize())
.addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
@ -566,10 +629,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// it's really whether we need SP to be accurate or not.
if (NumBytes != 0 && hasSP(MF)) {
uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
NumBytes + MFI.getMaxAlignment() : NumBytes;
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(NumBytes * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameDestroy);
.addImm(RoundedSize * ST.getWavefrontSize());
}
}
@ -759,7 +824,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
// All stack operations are relative to the frame offset SGPR.
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MFI.hasCalls() || MFI.hasVarSizedObjects();
return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
}

View File

@ -142,6 +142,7 @@ private:
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
bool IsStackRealigned = false;
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
@ -495,6 +496,14 @@ public:
HasNonSpillStackObjects = StackObject;
}
bool isStackRealigned() const {
return IsStackRealigned;
}
void setIsStackRealigned(bool Realigned = true) {
IsStackRealigned = Realigned;
}
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}

View File

@ -0,0 +1,125 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Check that we properly realign the stack. While 4-byte access is all
; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
; 128 byte object
; 4 byte emergency stack slot
; = 144 bytes with padding between them
; GCN-LABEL: {{^}}needs_align16_default_stack_align:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN-NOT: s32
; GCN: ; ScratchSize: 144
define void @needs_align16_default_stack_align(i32 %idx) #0 {
%alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
ret void
}
; GCN-LABEL: {{^}}needs_align16_stack_align4:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
; GCN: s_and_b32 s5, s6, 0xfffffc00
; GCN: s_add_u32 s32, s32, 0x2800{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: s_sub_u32 s32, s32, 0x2800
; GCN: ; ScratchSize: 160
define void @needs_align16_stack_align4(i32 %idx) #2 {
%alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
ret void
}
; GCN-LABEL: {{^}}needs_align32:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
; GCN: s_and_b32 s5, s6, 0xfffff800
; GCN: s_add_u32 s32, s32, 0x3000{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: s_sub_u32 s32, s32, 0x3000
; GCN: ; ScratchSize: 192
define void @needs_align32(i32 %idx) #0 {
%alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
ret void
}
; GCN-LABEL: {{^}}force_realign4:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
; GCN: s_and_b32 s5, s6, 0xffffff00
; GCN: s_add_u32 s32, s32, 0xd00{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
; GCN: s_sub_u32 s32, s32, 0xd00
; GCN: ; ScratchSize: 52
define void @force_realign4(i32 %idx) #1 {
%alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
%gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
store volatile i32 3, i32 addrspace(5)* %gep0, align 4
ret void
}
; GCN-LABEL: {{^}}kernel_call_align16_from_8:
; GCN: s_add_u32 s32, s8, 0x400{{$}}
; GCN-NOT: s32
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 2, i32 addrspace(5)* %alloca
call void @needs_align16_default_stack_align(i32 1)
ret void
}
; The call sequence should keep the stack on call aligned to 4
; GCN-LABEL: {{^}}kernel_call_align16_from_5:
; GCN: s_add_u32 s32, s8, 0x400
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align16_from_5() {
%alloca0 = alloca i8, align 1, addrspace(5)
store volatile i8 2, i8 addrspace(5)* %alloca0
call void @needs_align16_default_stack_align(i32 1)
ret void
}
; GCN-LABEL: {{^}}kernel_call_align4_from_5:
; GCN: s_add_u32 s32, s8, 0x400
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align4_from_5() {
%alloca0 = alloca i8, align 1, addrspace(5)
store volatile i8 2, i8 addrspace(5)* %alloca0
call void @needs_align16_stack_align4(i32 1)
ret void
}
attributes #0 = { noinline nounwind }
attributes #1 = { noinline nounwind "stackrealign" }
attributes #2 = { noinline nounwind alignstack=4 }