AMDGPU: Support realigning stack

While the stack access instructions don't care about alignment > 4, some transformations on the pointer calculation do make assumptions based on knowing the low bits of a pointer are 0. If a stack object ends up being accessed through its absolute address (relative to the kernel scratch wave offset), the addressing expression may depend on the stack frame being properly aligned. This was breaking in a testcase due to the add->or combine. I think some of the SP/FP handling logic is still backwards, and overly simplistic to support all of the stack features. Code which tries to modify the SP with inline asm for example or variable sized objects will probably require redoing this. llvm-svn: 328831
2018-03-29 21:30:06 +00:00 · 2018-03-29 21:30:06 +00:00 · 03ae399d50
parent 50635dab26
commit 03ae399d50
4 changed files with 210 additions and 8 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -587,6 +587,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();


  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@ -13,6 +13,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"

+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@ -487,9 +488,43 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
  }
 }

+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer.  We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
-  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
@ -498,6 +533,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@ -505,8 +541,36 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

+  // XXX - Is this the right predicate?
+
  bool NeedFP = hasFP(MF);
-  if (NeedFP) {
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = NumBytes;
+  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+  if (NeedsRealignment) {
+    assert(NeedFP);
+    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned ZeroLowBits = countTrailingZeros(Alignment);
+    assert(ZeroLowBits > 1);
+
+    RoundedSize += Alignment;
+
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, NumBytes
+    // s_and_b32 s32, tmp_reg, 0b111...0000
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+      .addReg(StackPtrReg)
+      .addImm((Alignment - 1) * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+      .addReg(ScratchSPReg, RegState::Kill)
+      .addImm(-Alignment * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    FuncInfo->setIsStackRealigned(true);
+  } else if (NeedFP) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
@ -516,11 +580,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
      .setMIFlag(MachineInstr::FrameSetup);
  }

-  uint32_t NumBytes = MFI.getStackSize();
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (RoundedSize != 0 && hasSP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
+      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

@ -566,10 +629,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
  // it's really whether we need SP to be accurate or not.

  if (NumBytes != 0 && hasSP(MF)) {
+    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+      NumBytes + MFI.getMaxAlignment() : NumBytes;
+
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addImm(RoundedSize * ST.getWavefrontSize());
  }
 }

@ -759,7 +824,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
 }

 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
  // All stack operations are relative to the frame offset SGPR.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects();
+  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
 }
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@ -142,6 +142,7 @@ private:
  bool HasSpilledSGPRs = false;
  bool HasSpilledVGPRs = false;
  bool HasNonSpillStackObjects = false;
+  bool IsStackRealigned = false;

  unsigned NumSpilledSGPRs = 0;
  unsigned NumSpilledVGPRs = 0;
@ -495,6 +496,14 @@ public:
    HasNonSpillStackObjects = StackObject;
  }

+  bool isStackRealigned() const {
+    return IsStackRealigned;
+  }
+
+  void setIsStackRealigned(bool Realigned = true) {
+    IsStackRealigned = Realigned;
+  }
+
  unsigned getNumSpilledSGPRs() const {
    return NumSpilledSGPRs;
  }
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that we properly realign the stack. While 4-byte access is all
+; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
+
+
+; 128 byte object
+; 4 byte emergency stack slot
+; = 144 bytes with padding between them
+
+; GCN-LABEL: {{^}}needs_align16_default_stack_align:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN-NOT: s32
+
+; GCN: ; ScratchSize: 144
+define void @needs_align16_default_stack_align(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align16_stack_align4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffffc00
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x2800
+
+; GCN: ; ScratchSize: 160
+define void @needs_align16_stack_align4(i32 %idx) #2 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align32:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffff800
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x3000
+
+; GCN: ; ScratchSize: 192
+define void @needs_align32(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+  ret void
+}
+
+; GCN-LABEL: {{^}}force_realign4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
+; GCN: s_and_b32 s5, s6, 0xffffff00
+; GCN: s_add_u32 s32, s32, 0xd00{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: s_sub_u32 s32, s32, 0xd00
+
+; GCN: ; ScratchSize: 52
+define void @force_realign4(i32 %idx) #1 {
+  %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align16_from_8:
+; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 2, i32 addrspace(5)* %alloca
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; The call sequence should keep the stack on call aligned to 4
+; GCN-LABEL: {{^}}kernel_call_align16_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8  addrspace(5)* %alloca0
+
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align4_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align4_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8  addrspace(5)* %alloca0
+
+  call void @needs_align16_stack_align4(i32 1)
+  ret void
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { noinline nounwind "stackrealign" }
+attributes #2 = { noinline nounwind alignstack=4 }