AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

This should avoid reporting any stack needs to be allocated in the case where no stack is truly used. An unused stack slot is still left around in other cases where there are real stack objects but no spilling occurs. llvm-svn: 295891
2017-02-22 22:23:32 +00:00 · 2017-02-22 22:23:32 +00:00 · 7b6c5d28f5
parent 639d7b68d6
commit 7b6c5d28f5
2 changed files with 57 additions and 39 deletions
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@ -383,6 +383,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 }
 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }
  return true;
 }
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
@ -391,35 +401,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
  if (!MFI.hasStackObjects())
    return;
  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
  if (MayNeedScavengingEmergencySlot) {
    // We force this to be at offset 0 so no user object ever has 0 as an
    // address, so we may use 0 as an invalid pointer value. This is because
    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
    // is required to be address space 0, we are forced to accept this for
    // now. Ideally we could have the stack in another address space with 0 as a
    // valid pointer, and -1 as the null value.
    //
    // This will also waste additional space when user stack objects require > 4
    // byte alignment.
    //
    // The main cost here is losing the offset for addressing modes. However
    // this also ensures we shouldn't need a register for the offset when
    // emergency scavenging.
    int ScavengeFI = MFI.CreateFixedObject(
      AMDGPU::SGPR_32RegClass.getSize(), 0, false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.spillSGPRToVGPR())
    return;
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->hasSpilledSGPRs())
+  bool AllSGPRSpilledToVGPRs = false;
-    return;
+
  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
    AllSGPRSpilledToVGPRs = true;
    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
    // are spilled to VGPRs, in which case we can eliminate the stack usage.
@ -437,8 +426,12 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
        if (TII->isSGPRSpill(MI)) {
          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
-        if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI))
+          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
-          TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
            (void)Spilled;
            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
          } else
            AllSGPRSpilledToVGPRs = false;
        }
      }
    }
@ -446,6 +439,32 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
  }
  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");
    // We force this to be at offset 0 so no user object ever has 0 as an
    // address, so we may use 0 as an invalid pointer value. This is because
    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
    // is required to be address space 0, we are forced to accept this for
    // now. Ideally we could have the stack in another address space with 0 as a
    // valid pointer, and -1 as the null value.
    //
    // This will also waste additional space when user stack objects require > 4
    // byte alignment.
    //
    // The main cost here is losing the offset for addressing modes. However
    // this also ensures we shouldn't need a register for the offset when
    // emergency scavenging.
    int ScavengeFI = MFI.CreateFixedObject(
      AMDGPU::SGPR_32RegClass.getSize(), 0, false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
 }
 void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
--- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@ -18,13 +18,12 @@
 ; GCN: s_mov_b32 m0
 ; Make sure scratch space isn't being used for SGPR->VGPR spills
 ; FIXME: Seem to be leaving behind unused emergency slot.
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; GCN-NOT: s_buffer_load_dword m0
 ; GCN: s_endpgm
-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
  %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@ -768,7 +767,7 @@ ENDIF66:                                          ; preds = %LOOP65
 ; GCN-LABEL: {{^}}main1:
 ; GCN: s_endpgm
-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
  %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0