llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;


// Returns a view of the leading SGPR_128 register tuples, truncated to
// ST.getMaxNumSGPRs(MF) / 4 entries (four 32-bit SGPRs per tuple).
static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  const unsigned NumTuples = ST.getMaxNumSGPRs(MF) / 4;
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), NumTuples);
}

// Returns a view of the leading 32-bit SGPRs, truncated to
// ST.getMaxNumSGPRs(MF) entries.
static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  const unsigned NumSGPRs = ST.getMaxNumSGPRs(MF);
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), NumSGPRs);
}

// Emit the instructions that initialize flat scratch.
//
// All instructions are inserted at the beginning of MBB. They combine the
// preloaded FLAT_SCRATCH_INIT register pair with the scratch wave offset
// register recorded in SIMachineFunctionInfo. One of three sequences is
// emitted, selected by ST.flatScratchIsPointer() and the subtarget generation:
//   * pointer form, GFX10 and later: 64-bit add in place, then two
//     S_SETREG_B32 writes;
//   * pointer form, earlier generations: 64-bit add directly into
//     FLAT_SCR_LO / FLAT_SCR_HI;
//   * non-pointer form: size copied to FLAT_SCR_LO, offset shifted right by 8
//     into FLAT_SCR_HI.
void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  // The init register is read below, so register it as a live-in of both the
  // function and this block.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  // Low and high 32-bit halves of the preloaded init register.
  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // Add the wave offset into the init pair in place (add + add-with-carry),
      // then write each half out with S_SETREG_B32. The immediate combines the
      // hardware register id with 31 in the WIDTH_M1 field.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // Before GFX10 the sum is written directly to FLAT_SCR_LO / FLAT_SCR_HI.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  // The non-pointer form below is only expected before GFX10.
  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchRsrcReg))
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

// Shift down registers reserved for the scratch wave offset.
unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  assert(MFI->isEntryFunction());

  // No replacement necessary.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
    return AMDGPU::NoRegister;
  }

  if (ST.hasSGPRInitBug())
    return ScratchWaveOffsetReg;

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return ScratchWaveOffsetReg;

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop register from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for scratch resource register
  // + 1 for register reserved for scratch wave offset.  (By exluding this
  //     register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there
  //     are no other free SGPRs, then the value will stay in this register.
  // + 1 if stack pointer is used.
  // ----
  //  13 (+1)
  unsigned ReservedRegCount = 13;

  if (AllSGPRs.size() < ReservedRegCount)
    return ScratchWaveOffsetReg;

  bool HandledScratchWaveOffsetReg =
    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven’t added its uses yet.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      if (!HandledScratchWaveOffsetReg) {
        HandledScratchWaveOffsetReg = true;

        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
          assert(!hasFP(MF));
          MFI->setStackPtrOffsetReg(Reg);
        }

        MFI->setScratchWaveOffsetReg(Reg);
        MFI->setFrameOffsetReg(Reg);
        ScratchWaveOffsetReg = Reg;
        break;
      }
    }
  }

  return ScratchWaveOffsetReg;
}

// Emit the prologue for an entry function into MBB, which must be the first
// block of MF.
//
// In order, this:
//   1. emits flat scratch initialization when MFI reports it is needed;
//   2. chooses the final scratch resource and scratch wave offset registers;
//   3. re-adds the preloaded input registers as live-ins and marks the chosen
//      registers live into every other block;
//   4. copies the preloaded inputs into the chosen registers, ordering the two
//      copies so that neither input is overwritten before it is read;
//   5. emits the scratch resource setup when the resource register is used;
//   6. when a frame pointer is required, initializes the stack pointer from
//      the scratch wave offset (plus the scaled stack size, if non-zero).
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  // We need to do the replacement of the private segment buffer and wave offset
  // register even if there are no stack objects. There could be stores to undef
  // or a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  // Both helpers may rewrite register uses and update MFI as a side effect.
  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);

  unsigned ScratchWaveOffsetReg =
      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // The private segment buffer is only looked up for HSA / Mesa functions.
  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                         MRI.isPhysRegUsed(ScratchRsrcReg);

  // FIXME: Hack to not crash in situations which emitted an error.
  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
    return;

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the register selected live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  // Unknown debug location; everything below is inserted at the block start.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdHsaOrMesa(F) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it's been copied to it's final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  assert(SPReg != AMDGPU::SP_REG);

  // FIXME: Remove the isPhysRegUsed checks
  const bool HasFP = hasFP(MF);

  if (HasFP || OffsetRegUsed) {
    assert(ScratchWaveOffsetReg);
    // The preloaded offset is only marked killed when a frame pointer is
    // required.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed) {
    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
        PreloadedPrivateBufferReg, ScratchRsrcReg);
  }

  if (HasFP) {
    // NOTE(review): this DebugLoc shadows the default-constructed DL declared
    // earlier in this function; it looks redundant.
    DebugLoc DL;
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t StackSize = FrameInfo.getStackSize();

    // On kernel entry, the private scratch wave offset is the SP value.
    if (StackSize == 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg());
    } else {
      // Otherwise SP starts past the frame: wave offset plus the stack size
      // scaled by the wavefront size.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg())
        .addImm(StackSize * ST.getWavefrontSize());
    }
  }
}

// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
//
// Instructions are inserted before I in MBB and fill in ScratchRsrcReg.
//   * AMDPAL: build the GIT pointer in the low two dwords of ScratchRsrcReg,
//     then load the four-dword descriptor through it.
//   * Mesa graphics shader, or no preloaded private buffer register: fill
//     dwords 0-1 from the implicit buffer pointer (when present) or from the
//     SCRATCH_RSRC_DWORD0/1 external symbols, and dwords 2-3 from
//     TII->getScratchRsrcWords23().
// In any other case nothing is emitted.
void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
      unsigned ScratchRsrcReg) const {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const Function &Fn = MF.getFunction();
  DebugLoc DL;

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    // 0xffffffff is treated as "no explicit high half": fall back to the PC.
    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
    if (ST.hasMergedShaders()) {
      switch (MF.getFunction().getCallingConv()) {
        case CallingConv::AMDGPU_HS:
        case CallingConv::AMDGPU_GS:
          // Low GIT address is passed in s8 rather than s0 for an LS+HS or
          // ES+GS merged shader on gfx9+.
          GitPtrLo = AMDGPU::SGPR8;
          break;
        default:
          break;
      }
    }
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    // Overwrite the low dword with the GIT low address.
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    PointerType *PtrTy =
      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                       AMDGPUAS::CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                       MachineMemOperand::MOInvariant |
                                       MachineMemOperand::MODereferenceable,
                                       16, 4);
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    // NOTE(review): presumably the same subtarget object as ST — confirm.
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
    return;
  }
  if (ST.isMesaGfxShader(Fn)
      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        // Compute: the implicit buffer pointer SGPR pair is moved directly
        // into the low two dwords.
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        // Otherwise the low two dwords are loaded through the implicit buffer
        // pointer.
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           8, 4);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      // No implicit buffer pointer: dwords 0 and 1 come from external symbols.
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    // Dwords 2 and 3 are the low and high halves of Rsrc23.
    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

// Look for a register in RC that is free to be used as a temporary at the
// start of the prologue (for example to re-align the stack pointer).
//
// Callee-saved registers are deliberately excluded: they can look free when
// this runs from canUseAsPrologue (during shrink wrapping) and then be
// occupied by the time emitPrologue runs.
//
// As a side effect, every callee-saved register is added to LiveRegs.
// Returns AMDGPU::NoRegister when nothing in RC is available.
//
// FIXME: This is a bit conservative. A callee-save register could serve as the
// temporary, but only if we guaranteed that at least one callee-save register
// is actually saved in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
                                                 LivePhysRegs &LiveRegs,
                                                 const TargetRegisterClass &RC) {
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();

  // Mark callee saved registers as used so we will not choose them. The list
  // is terminated by a zero entry.
  for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); *CSR; ++CSR)
    LiveRegs.addReg(*CSR);

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // First register of the class that liveness reports as available wins.
  for (auto It = RC.begin(), End = RC.end(); It != End; ++It) {
    const unsigned Candidate = *It;
    if (LiveRegs.available(MRI, Candidate))
      return Candidate;
  }

  return AMDGPU::NoRegister;
}

// Emit the function prologue at the start of MBB.
//
// Entry functions are delegated to emitEntryFunctionPrologue. For all other
// functions this:
//   * sets up the frame pointer, either re-aligned (when the stack needs
//     realignment) or as a plain copy of the stack pointer (when hasFP());
//   * bumps the stack pointer by the frame size scaled by the wavefront size
//     when a frame pointer is in use and the size is non-zero;
//   * stores every VGPR that is used for SGPR spilling and has a frame index,
//     with all lanes enabled for the duration of the stores.
//
// Aborts with a fatal error if no scratch SGPR pair can be found to hold the
// saved EXEC mask.
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
  // Initialized lazily, only if a scratch register has to be found.
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlignment();

    // Reserve extra room for the realignment padding.
    RoundedSize += Alignment;

    LiveRegs.init(TRI);
    LiveRegs.addLiveIns(MBB);

    unsigned ScratchSPReg
      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
                                         AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg != AMDGPU::NoRegister);

    // s_add_u32 tmp_reg, sp, (Alignment - 1) * wavefront_size
    // s_and_b32 fp, tmp_reg, -Alignment * wavefront_size
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
      .addReg(StackPtrReg)
      .addImm((Alignment - 1) * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
      .addReg(ScratchSPReg, RegState::Kill)
      .addImm(-Alignment * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if (hasFP(MF)) {
    HasFP = true;

    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
      .addReg(StackPtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  unsigned ScratchExecCopy = AMDGPU::NoRegister;

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    // Save EXEC and enable all lanes once, before the first store.
    if (ScratchExecCopy == AMDGPU::NoRegister) {
      if (LiveRegs.empty()) {
        LiveRegs.init(TRI);
        LiveRegs.addLiveIns(MBB);
      }

      ScratchExecCopy
        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
                                           AMDGPU::SReg_64_XEXECRegClass);

      // Without a scratch register there is nowhere to keep the original EXEC
      // mask; emitting the instruction with NoRegister as its destination
      // would produce invalid machine code.
      if (ScratchExecCopy == AMDGPU::NoRegister)
        report_fatal_error("failed to find free scratch register");

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
              ScratchExecCopy)
        .addImm(-1);
    }

    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                             &TII->getRegisterInfo());
  }

  if (ScratchExecCopy != AMDGPU::NoRegister) {
    // Restore the original EXEC mask.
    // FIXME: Split block and make terminator.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(ScratchExecCopy);
  }
}

// Emit the function epilogue before the first terminator of MBB.
//
// Nothing is emitted for entry functions. Otherwise this reloads every VGPR
// that is used for SGPR spilling and has a frame index (with all lanes enabled
// for the duration of the loads), and, when hasFP(), subtracts the frame size
// scaled by the wavefront size from the stack pointer.
//
// Aborts with a fatal error if no scratch SGPR pair can be found to hold the
// saved EXEC mask.
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc DL;

  unsigned ScratchExecCopy = AMDGPU::NoRegister;
  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    // Save EXEC and enable all lanes once, before the first reload.
    if (ScratchExecCopy == AMDGPU::NoRegister) {
      // See emitPrologue
      // NOTE(review): liveness is seeded from the block's live-ins although
      // the insertion point is the first terminator — confirm this is
      // conservative enough for registers defined inside the block.
      LivePhysRegs LiveRegs(*ST.getRegisterInfo());
      LiveRegs.addLiveIns(MBB);

      ScratchExecCopy
        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
                                           AMDGPU::SReg_64_XEXECRegClass);

      // Without a scratch register there is nowhere to keep the original EXEC
      // mask; emitting the instruction with NoRegister as its destination
      // would produce invalid machine code.
      if (ScratchExecCopy == AMDGPU::NoRegister)
        report_fatal_error("failed to find free scratch register");

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
        .addImm(-1);
    }

    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                              &TII->getRegisterInfo());
  }

  if (ScratchExecCopy != AMDGPU::NoRegister) {
    // Restore the original EXEC mask.
    // FIXME: Split block and make terminator.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(ScratchExecCopy);
  }

  if (hasFP(MF)) {
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    uint32_t NumBytes = MFI.getStackSize();
    // Mirror the prologue: a realigned stack was bumped by the maximum
    // alignment in addition to the frame size.
    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
      NumBytes + MFI.getMaxAlignment() : NumBytes;

    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }
}

static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            unsigned &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

// Runs before frame offsets are finalized. Does nothing when the function has
// no stack objects. Otherwise:
//   * when SGPR-to-VGPR spilling is enabled and SGPRs were spilled, rewrites
//     each SGPR spill instruction whose frame index could be assigned VGPR
//     lanes;
//   * removes the frame indices that were turned into VGPR lanes;
//   * creates an emergency scavenging slot (fixed object for entry functions,
//     ordinary stack object otherwise) if any stack usage may remain.
void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  const bool TrySGPRToVGPR =
      TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs();
  // Starts out true only if we attempt the conversion at all; cleared as soon
  // as one spill cannot be placed in a VGPR.
  bool AllSGPRSpilledToVGPRs = TrySGPRToVGPR;

  if (TrySGPRToVGPR) {
    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
    // are spilled to VGPRs, in which case we can eliminate the stack usage.
    //
    // XXX - This operates under the assumption that only other SGPR spills are
    // users of the frame index. I'm not 100% sure this is correct. The
    // StackColoring pass has a comment saying a future improvement would be to
    // merging of allocas with spill slots, but for now according to
    // MachineFrameInfo isSpillSlot can't alias any other object.
    for (MachineBasicBlock &MBB : MF) {
      MachineBasicBlock::iterator It = MBB.begin();
      const MachineBasicBlock::iterator End = MBB.end();
      while (It != End) {
        MachineInstr &MI = *It;
        // Step past MI before it is possibly rewritten below.
        ++It;

        if (!TII->isSGPRSpill(MI))
          continue;

        const int FI =
            TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
        assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);

        if (!FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
          AllSGPRSpilledToVGPRs = false;
          continue;
        }

        bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
        (void)Spilled;
        assert(Spilled && "failed to spill SGPR to VGPR when allocated");
      }
    }
  }

  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  const bool NeedsEmergencySlot =
      FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI);
  if (!NeedsEmergencySlot)
    return;

  assert(RS && "RegScavenger required if spilling");

  // Entry functions get a fixed object at offset 0; everything else gets a
  // regular stack object with the SGPR_32 spill alignment.
  const int ScavengeFI =
      FuncInfo->isEntryFunction()
          ? MFI.CreateFixedObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0,
                                  false)
          : MFI.CreateStackObject(
                TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass), false);
  RS->addScavengingFrameIndex(ScavengeFI);
}

// Computes the callee-saved register set using the generic implementation and
// then clears the stack pointer register from it.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

  // The SP is specifically managed and we don't want extra spills of it.
  const SIMachineFunctionInfo &FuncInfo = *MF.getInfo<SIMachineFunctionInfo>();
  const unsigned StackPtrReg = FuncInfo.getStackPtrOffsetReg();
  SavedRegs.reset(StackPtrReg);
}

// Replaces the call frame setup/destroy pseudo instruction at I.
//
// The pseudo is always erased and the iterator returned by the erase is handed
// back. When the call frame is not reserved and the adjustment amount is
// non-zero, an explicit stack pointer add (setup) or subtract (destroy) of the
// aligned amount, scaled by the wavefront size, is inserted first.
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  const int64_t RawAmount = I->getOperand(0).getImm();
  if (RawAmount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  const bool IsDestroy = I->getOpcode() == TII->getCallFrameDestroyOpcode();
  // Only the destroy pseudo carries a callee-pop operand.
  const uint64_t CalleePopAmount =
      IsDestroy ? I->getOperand(1).getImm() : 0;

  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  if (TFI->hasReservedCallFrame(MF)) {
    if (CalleePopAmount != 0)
      llvm_unreachable("is this used?");
    return MBB.erase(I);
  }

  const unsigned StackAlign = getStackAlignment();
  const int64_t AlignedAmount = alignTo(RawAmount, StackAlign);
  assert(isUInt<32>(AlignedAmount) && "exceeded stack address space size");

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const unsigned SPReg = MFI->getStackPtrOffsetReg();

  // Destroy releases the space again; setup claims it.
  const unsigned AdjustOpc =
      IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
  BuildMI(MBB, I, DL, TII->get(AdjustOpc), SPReg)
    .addReg(SPReg)
    .addImm(AlignedAmount * ST.getWavefrontSize());

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (MFI.hasCalls()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.
    if (MFI.getStackSize() != 0)
      return true;

    // For the entry point, the input wave scratch offset must be copied to the
    // API SP if there are calls.
    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
      return true;

    // Retain behavior of always omitting the FP for leaf functions when
    // possible.
    if (MF.getTarget().Options.DisableFramePointerElim(MF))
      return true;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
    MFI.hasStackMap() || MFI.hasPatchPoint() ||
    MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF);
}
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								//===----------------------- SIFrameLowering.cpp --------------------------===//
 								//
-												Update the file headers across all of the LLVM projects in the monorepo
to reflect the new license.

We understand that people may be surprised that we're moving the header
entirely to discuss the new license. We checked this carefully with the
Foundation's lawyer and we believe this is the correct approach.

Essentially, all code in the project is now made available by the LLVM
project under our new license, so you will see that the license headers
include that license only. Some of our contributors have contributed
code under our old license, and accordingly, we have retained a copy of
our old license notice in the top-level files in each project and
repository.

llvm-svn: 351636

											
										
										
											2019-01-19 16:50:56 +08:00
+								// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 								// See https://llvm.org/LICENSE.txt for license information.
 								// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								//
 								//==-----------------------------------------------------------------------===//
 								#include "SIFrameLowering.h"
-												Sort the remaining #include lines in include/... and lib/....

I did this a long time ago with a janky python script, but now
clang-format has built-in support for this. I fed clang-format every
line with a #include and let it re-sort things according to the precise
LLVM rules for include ordering baked into clang-format these days.

I've reverted a number of files where the results of sorting includes
isn't healthy. Either places where we have legacy code relying on
particular include ordering (where possible, I'll fix these separately)
or where we have particular formatting around #include lines that
I didn't want to disturb in this patch.

This patch is *entirely* mechanical. If you get merge conflicts or
anything, just ignore the changes in this patch and run clang-format
over your #include lines in the files.

Sorry for any noise here, but it is important to keep these things
stable. I was seeing an increasing number of patches with irrelevant
re-ordering of #include lines because clang-format was used. This patch
at least isolates that churn, makes it easy to skip when resolving
conflicts, and gets us to a clean baseline (again).

llvm-svn: 304787

											
										
										
											2017-06-06 19:49:48 +08:00
+								#include "AMDGPUSubtarget.h"
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								#include "SIInstrInfo.h"
 								#include "SIMachineFunctionInfo.h"
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								#include "SIRegisterInfo.h"
-												AMDGPU: Remove #include "MCTargetDesc/AMDGPUMCTargetDesc.h" from common headers

Summary:
MCTargetDesc/AMDGPUMCTargetDesc.h contains enums for all the instruction
and register definitions, which are huge so we only want to include
them where needed.

This will also make it easier if we want to split the R600 and GCN
definitions into separate tablegenerated files.

I was unable to remove AMDGPUMCTargetDesc.h from SIMachineFunctionInfo.h
because it uses some enums from the header to initialize default values
for the SIMachineFunction class, so I ended up having to remove includes of
SIMachineFunctionInfo.h from headers too.

Reviewers: arsenm, nhaehnle

Reviewed By: nhaehnle

Subscribers: MatzeB, kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D46272

llvm-svn: 332930

											
										
										
											2018-05-22 10:03:23 +08:00
+								#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-												AMDGPU: Cleanup subtarget handling.

Split AMDGPUSubtarget into amdgcn/r600 specific subclasses.
This removes most of the static_casting of the basic codegen
classes everywhere, and tries to restrict the features
visible on the wrong target.

llvm-svn: 273652

											
										
										
											2016-06-24 14:30:11 +08:00
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								#include "llvm/CodeGen/LivePhysRegs.h"
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								#include "llvm/CodeGen/MachineFrameInfo.h"
 								#include "llvm/CodeGen/MachineFunction.h"
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								#include "llvm/CodeGen/MachineInstrBuilder.h"
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								#include "llvm/CodeGen/RegisterScavenging.h"
 								using namespace llvm;
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								                                         const MachineFunction &MF) {
-												AMDGPU: Fix assert on ttmp registers

Use register class that does not include them when looking
for unallocated registers.

This is hit by the udiv v8i64 test in the opencl integer
conformance test, and takes a few seconds to compile in
a debug build so no test included.

llvm-svn: 269938

											
										
										
											2016-05-18 23:19:50 +08:00
+								  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								                      ST.getMaxNumSGPRs(MF) / 4);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								}
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								                                       const MachineFunction &MF) {
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								                      ST.getMaxNumSGPRs(MF));
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								}
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								                                          MachineFunction &MF,
 								                                          MachineBasicBlock &MBB) const {
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
 								  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
-												AMDGPU: Pass special input registers to functions

llvm-svn: 309998

											
										
										
											2017-08-04 07:00:29 +08:00
+								  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // We don't need this if we only have spills since there is no user facing
 								  // scratch.
 								  // TODO: If we know we don't have flat instructions earlier, we can omit
 								  // this from the input registers.
 								  //
 								  // TODO: We only need to know if we access scratch space through a flat
 								  // pointer. Because we only detect if flat instructions are used at all,
 								  // this will be used more often than necessary on VI.
 								  // Debug location must be unknown since the first debug location is used to
 								  // determine the end of the prologue.
 								  DebugLoc DL;
 								  MachineBasicBlock::iterator I = MBB.begin();
 								  unsigned FlatScratchInitReg
-												AMDGPU: Pass special input registers to functions

llvm-svn: 309998

											
										
										
											2017-08-04 07:00:29 +08:00
+								    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
 								  MachineRegisterInfo &MRI = MF.getRegInfo();
 								  MRI.addLiveIn(FlatScratchInitReg);
 								  MBB.addLiveIn(FlatScratchInitReg);
 								  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
 								  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								  // Do a 64-bit pointer add.
 								  if (ST.flatScratchIsPointer()) {
-												[AMDGPU] gfx1010 VMEM and SMEM implementation

Differential Revision: https://reviews.llvm.org/D61330

llvm-svn: 359621

											
										
										
											2019-05-01 06:08:23 +08:00
+								    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
 								        .addReg(FlatScrInitLo)
 								        .addReg(ScratchWaveOffsetReg);
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
 								        .addReg(FlatScrInitHi)
 								        .addImm(0);
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
 								        addReg(FlatScrInitLo).
 								        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
 								                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
 								        addReg(FlatScrInitHi).
 								        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
 								                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
 								      return;
 								    }
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
 								      .addReg(FlatScrInitLo)
 								      .addReg(ScratchWaveOffsetReg);
 								    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
 								      .addReg(FlatScrInitHi)
 								      .addImm(0);
 								    return;
 								  }
-												[AMDGPU] gfx1010 VMEM and SMEM implementation

Differential Revision: https://reviews.llvm.org/D61330

llvm-svn: 359621

											
										
										
											2019-05-01 06:08:23 +08:00
+								  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								  // Copy the size in bytes.
 								  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
 								    .addReg(FlatScrInitHi, RegState::Kill);
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // Add wave offset in bytes to private base offset.
 								  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
 								  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
 								    .addReg(FlatScrInitLo)
 								    .addReg(ScratchWaveOffsetReg);
 								  // Convert offset to 256-byte units.
 								  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
 								    .addReg(FlatScrInitLo, RegState::Kill)
 								    .addImm(8);
 								}
 								unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const GCNSubtarget &ST,
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  const SIInstrInfo *TII,
 								  const SIRegisterInfo *TRI,
 								  SIMachineFunctionInfo *MFI,
 								  MachineFunction &MF) const {
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  MachineRegisterInfo &MRI = MF.getRegInfo();
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
 								  // We need to insert initialization of the scratch resource descriptor.
 								  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  if (ScratchRsrcReg == AMDGPU::NoRegister ||
 								      !MRI.isPhysRegUsed(ScratchRsrcReg))
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								    return AMDGPU::NoRegister;
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
 								  if (ST.hasSGPRInitBug() ||
 								      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
 								    return ScratchRsrcReg;
 								  // We reserved the last registers for this. Shift it down to the end of those
 								  // which were actually used.
 								  //
 								  // FIXME: It might be safer to use a pseudoregister before replacement.
 								  // FIXME: We should be able to eliminate unused input registers. We only
 								  // cannot do this for the resources required for scratch access. For now we
 								  // skip over user SGPRs and may leave unused holes.
 								  // We find the resource first because it has an alignment requirement.
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
-												AMDGPU: Don't use stack space for SGPR->VGPR spills

Before frame offsets are calculated, try to eliminate the
frame indexes used by SGPR spills. Then we can delete them
after.

I think for now we can be sure that no other instruction
will be re-using the same frame indexes. It should be easy
to notice if this assumption ever breaks since everything
asserts if it tries to use a dead frame index later.

The unused emergency stack slot seems to still be left behind,
so an additional 4 bytes is still wasted.

llvm-svn: 295753

											
										
										
											2017-02-22 03:12:08 +08:00
+								  // Skip the last N reserved elements because they should have already been
 								  // reserved for VCC etc.
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  for (MCPhysReg Reg : AllSGPR128s) {
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								    // Pick the first unallocated one. Make sure we don't clobber the other
 								    // reserved input we needed.
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								      MRI.replaceRegWith(ScratchRsrcReg, Reg);
 								      MFI->setScratchRSrcReg(Reg);
 								      return Reg;
 								    }
 								  }
 								  return ScratchRsrcReg;
 								}
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								// Shift down registers reserved for the scratch wave offset.
 								unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
 								    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
 								    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  MachineRegisterInfo &MRI = MF.getRegInfo();
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  assert(MFI->isEntryFunction());
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  // No replacement necessary.
 								  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
 								    return AMDGPU::NoRegister;
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								  }
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								  if (ST.hasSGPRInitBug())
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								    return ScratchWaveOffsetReg;
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
 								  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-												[AMDGPU] Move register related queries to subtarget class

Differential Revision: https://reviews.llvm.org/D29318

llvm-svn: 294440

											
										
										
											2017-02-08 21:02:33 +08:00
+								  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  if (NumPreloaded > AllSGPRs.size())
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								    return ScratchWaveOffsetReg;
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
 								  AllSGPRs = AllSGPRs.slice(NumPreloaded);
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // We need to drop register from the end of the list that we cannot use
 								  // for the scratch wave offset.
 								  // + 2 s102 and s103 do not exist on VI.
 								  // + 2 for vcc
 								  // + 2 for xnack_mask
 								  // + 2 for flat_scratch
 								  // + 4 for registers reserved for scratch resource register
 								  // + 1 for register reserved for scratch wave offset.  (By excluding this
 								  //     register from the list to consider, it means that when this
 								  //     register is being used for the scratch wave offset and there
 								  //     are no other free SGPRs, then the value will stay in this register.)
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								  // + 1 if stack pointer is used.
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // ----
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								  //  13 (+1)
 								  unsigned ReservedRegCount = 13;
 								  if (AllSGPRs.size() < ReservedRegCount)
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								    return ScratchWaveOffsetReg;
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
 								  bool HandledScratchWaveOffsetReg =
 								    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
 								    // scratch descriptor, since we haven't added its uses yet.
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								      if (!HandledScratchWaveOffsetReg) {
 								        HandledScratchWaveOffsetReg = true;
 								        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
 								          assert(!hasFP(MF));
 								          MFI->setStackPtrOffsetReg(Reg);
 								        }
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								        MFI->setScratchWaveOffsetReg(Reg);
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								        MFI->setFrameOffsetReg(Reg);
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
+								        ScratchWaveOffsetReg = Reg;
 								        break;
 								      }
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								    }
 								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  return ScratchWaveOffsetReg;
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								}
-												AMDGPU: Start defining a calling convention

Partially implement callee-side for arguments and return values.
byval doesn't work properly, and most likely sret or other on-stack
return values don't either.

llvm-svn: 303308

											
										
										
											2017-05-18 05:56:25 +08:00
+								void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
 								                                                MachineBasicBlock &MBB) const {
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
 								  // If we only have SGPR spills, we won't actually be using scratch memory
 								  // since these spill to VGPRs.
 								  //
 								  // FIXME: We should be cleaning up these unused SGPR spill frame indices
 								  // somewhere.
-												AMDGPU: Remove debugger related subtarget features

As far as I know these aren't needed anymore.

llvm-svn: 354634

											
										
										
											2019-02-22 07:27:46 +08:00
+								  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Cleanup subtarget handling.

Split AMDGPUSubtarget into amdgcn/r600 specific subclasses.
This removes most of the static_casting of the basic codegen
classes everywhere, and tries to restrict the features
visible on the wrong target.

llvm-svn: 273652

											
										
										
											2016-06-24 14:30:11 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-												AMDGPU: Set flat_scratch from flat_scratch_init reg

This was hardcoded to the static private size, but this
would be missing the offset and additional size for someday
when we have dynamic sizing.

Also stops always initializing flat_scratch even when unused.

In the future we should stop emitting this unless flat instructions
are used to access private memory. For example this will initialize
it almost always on VI because flat is used for global access.

llvm-svn: 260658

											
										
										
											2016-02-12 14:31:30 +08:00
+								  MachineRegisterInfo &MRI = MF.getRegInfo();
-												AMDGPU: Pass function directly instead of MachineFunction

These functions just query the underlying IR function,
so pass it directly.

llvm-svn: 333442

											
										
										
											2018-05-30 01:42:50 +08:00
+								  const Function &F = MF.getFunction();
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  // We need to do the replacement of the private segment buffer and wave offset
 								  // register even if there are no stack objects. There could be stores to undef
 								  // or a constant without an associated object.
 								  // FIXME: We still have implicit uses on SGPR spill instructions in case they
 								  // need to spill to vector memory. It's likely that will not happen, but at
 								  // this point it appears we need the setup. This part of the prolog should be
 								  // emitted after frame indices are eliminated.
-												AMDGPU: Annotate necessity of flat-scratch-init

As an approximation of the existing handling to avoid
regressions. Fixes using too many registers with calls
on subtargets with the SGPR allocation bug.

llvm-svn: 308326

											
										
										
											2017-07-19 00:44:58 +08:00
+								  if (MFI->hasFlatScratchInit())
-												AMDGPU: Merge initial gfx9 support

llvm-svn: 295554

											
										
										
											2017-02-19 02:29:53 +08:00
+								    emitFlatScratchInit(ST, MF, MBB);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  unsigned ScratchRsrcReg
 								    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
-												AMDGPU: Shift down reserved SP register like scratch wave offset

llvm-svn: 301367

											
										
										
											2017-04-26 07:40:57 +08:00
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  unsigned ScratchWaveOffsetReg =
 								      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // We need to insert initialization of the scratch resource descriptor.
-												AMDGPU: Pass special input registers to functions

llvm-svn: 309998

											
										
										
											2017-08-04 07:00:29 +08:00
+								  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
 								    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
 								  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-												AMDGPU: Rename isAmdCodeObjectV2 -> isAmdHsaOrMesa

The isAmdCodeObjectV2 is a misleading name which actually checks whether the os
is amdhsa or mesa.

Also add a test to make sure we do not generate old kernel header for code
object v3.

Differential Revision: https://reviews.llvm.org/D52897

llvm-svn: 343813

											
										
										
											2018-10-05 05:02:16 +08:00
+								  if (ST.isAmdHsaOrMesa(F)) {
-												AMDGPU: Pass special input registers to functions

llvm-svn: 309998

											
										
										
											2017-08-04 07:00:29 +08:00
+								    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
 								      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
 								                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
-												AMDGPU: Slightly simplify prolog reserved register handling

Rely on MachineRegisterInfo's knowledge of used physical
registers.

Move flat_scratch initialization earlier, so the uses are visible
when making these decisions.

This will make it easier to add another reserved register
at the end for the stack pointer rather than handling another
special case.

llvm-svn: 301254

											
										
										
											2017-04-25 05:08:32 +08:00
+								  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
 								                         MRI.isPhysRegUsed(ScratchRsrcReg);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  // FIXME: Hack to not crash in situations which emitted an error.
 								  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
 								    return;
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  // We added live-ins during argument lowering, but since they were not used
 								  // they were deleted. We're adding the uses now, so add them back.
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
 								  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
-												AMDGPU: Rename isAmdCodeObjectV2 -> isAmdHsaOrMesa

The isAmdCodeObjectV2 is a misleading name which actually checks whether the os
is amdhsa or mesa.

Also add a test to make sure we do not generate old kernel header for code
object v3.

Differential Revision: https://reviews.llvm.org/D52897

llvm-svn: 343813

											
										
										
											2018-10-05 05:02:16 +08:00
+								    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								    MRI.addLiveIn(PreloadedPrivateBufferReg);
 								    MBB.addLiveIn(PreloadedPrivateBufferReg);
 								  }
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  // Make the register selected live throughout the function.
 								  for (MachineBasicBlock &OtherBB : MF) {
 								    if (&OtherBB == &MBB)
 								      continue;
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								    if (OffsetRegUsed)
 								      OtherBB.addLiveIn(ScratchWaveOffsetReg);
 								    if (ResourceRegUsed)
 								      OtherBB.addLiveIn(ScratchRsrcReg);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  }
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								  DebugLoc DL;
-												AMDGPU: Refactor frame lowering

This will make future changes easier.

llvm-svn: 280296

											
										
										
											2016-09-01 05:52:21 +08:00
+								  MachineBasicBlock::iterator I = MBB.begin();
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  // If we reserved the original input registers, we don't need to copy to the
 								  // reserved registers.
 								  bool CopyBuffer = ResourceRegUsed &&
 								    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
-												AMDGPU: Rename isAmdCodeObjectV2 -> isAmdHsaOrMesa

The isAmdCodeObjectV2 is a misleading name which actually checks whether the os
is amdhsa or mesa.

Also add a test to make sure we do not generate old kernel header for code
object v3.

Differential Revision: https://reviews.llvm.org/D52897

llvm-svn: 343813

											
										
										
											2018-10-05 05:02:16 +08:00
+								    ST.isAmdHsaOrMesa(F) &&
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								    ScratchRsrcReg != PreloadedPrivateBufferReg;
 								  // This needs to be careful of the copying order to avoid overwriting one of
 								  // the input registers before it's been copied to it's final
 								  // destination. Usually the offset should be copied first.
 								  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
 								                                              ScratchWaveOffsetReg);
 								  if (CopyBuffer && CopyBufferFirst) {
 								    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
 								      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
 								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  unsigned SPReg = MFI->getStackPtrOffsetReg();
 								  assert(SPReg != AMDGPU::SP_REG);
 								  // FIXME: Remove the isPhysRegUsed checks
 								  const bool HasFP = hasFP(MF);
 								  if (HasFP || OffsetRegUsed) {
 								    assert(ScratchWaveOffsetReg);
-												AMDGPU: Use copy instead of mov during frame lowering

This occurs before RA pseudos are expanded. It's less
code to emit the copy.

llvm-svn: 280297

											
										
										
											2016-09-01 05:52:25 +08:00
+								    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								  }
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  if (CopyBuffer && !CopyBufferFirst) {
-												AMDGPU: Use copy instead of mov during frame lowering

This occurs before RA pseudos are expanded. It's less
code to emit the copy.

llvm-svn: 280297

											
										
										
											2016-09-01 05:52:25 +08:00
+								    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
 								      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
-												AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could go over the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.

llvm-svn: 285435

											
										
										
											2016-10-29 03:43:31 +08:00
+								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  if (ResourceRegUsed) {
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table are passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
 								        PreloadedPrivateBufferReg, ScratchRsrcReg);
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  }
 								  if (HasFP) {
 								    DebugLoc DL;
 								    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 								    int64_t StackSize = FrameInfo.getStackSize();
 								    // On kernel entry, the private scratch wave offset is the SP value.
 								    if (StackSize == 0) {
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
 								        .addReg(MFI->getScratchWaveOffsetReg());
 								    } else {
 								      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
 								        .addReg(MFI->getScratchWaveOffsetReg())
 								        .addImm(StackSize * ST.getWavefrontSize());
 								    }
 								  }
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								}
 								// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
 								      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
 								      unsigned ScratchRsrcReg) const {
 								  const SIInstrInfo *TII = ST.getInstrInfo();
 								  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-												AMDGPU: Pass function directly instead of MachineFunction

These functions just query the underlying IR function,
so pass it directly.

llvm-svn: 333442

											
										
										
											2018-05-30 01:42:50 +08:00
+								  const Function &Fn = MF.getFunction();
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								  DebugLoc DL;
 								  if (ST.isAmdPalOS()) {
 								    // The pointer to the GIT is formed from the offset passed in and either
 								    // the amdgpu-git-ptr-high function attribute or the top part of the PC
 								    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
 								    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
 								    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
 								    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 								    if (MFI->getGITPtrHigh() != 0xffffffff) {
 								      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
 								        .addImm(MFI->getGITPtrHigh())
 								        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								    } else {
 								      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
 								      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
 								    }
-												[AMDGPU] Scratch setup fix on AMDPAL gfx9+ merge shader

Summary:
With OS type AMDPAL, the scratch descriptor is hardwired to be loaded
from offset 0 of the global information table, whose low pointer is
passed in s0. For a merge shader on gfx9+, it needs to be s8 instead, as
the hardware reserves s0-s7.

Reviewers: kzhuravl

Subscribers: arsenm, nhaehnle, dstuttard, llvm-commits, t-tye, yaxunl, wdng, kzhuravl

Differential Revision: https://reviews.llvm.org/D42203

llvm-svn: 326088

											
										
										
											2018-02-26 22:46:43 +08:00
+								    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
 								    if (ST.hasMergedShaders()) {
 								      switch (MF.getFunction().getCallingConv()) {
 								        case CallingConv::AMDGPU_HS:
 								        case CallingConv::AMDGPU_GS:
 								          // Low GIT address is passed in s8 rather than s0 for an LS+HS or
 								          // ES+GS merged shader on gfx9+.
 								          GitPtrLo = AMDGPU::SGPR8;
 								          break;
 								        default:
 								          break;
 								      }
 								    }
-												[AMDGPU] For OS type AMDPAL, fixed scratch on compute shader

Summary:
For OS type AMDPAL, the scratch descriptor is loaded from offset 0 of
the GIT, whose 32 bit pointer is in s0 (s8 for gfx9 merged shaders).

This commit fixes that to use offset 0x10 instead of offset 0 for a
compute shader, per the PAL ABI spec.

V2: Ensure s0 (s8 for gfx9 merged shader) is marked live-in when loading
scratch descriptor from GIT.

Reviewers: kzhuravl, nhaehnle, timcorringham

Subscribers: kzhuravl, wdng, yaxunl, t-tye, llvm-commits, dstuttard, nhaehnle, arsenm

Differential Revision: https://reviews.llvm.org/D44468

Change-Id: I93dffa647758e37f613bb5e0dfca840d82e6d26f
llvm-svn: 329690

											
										
										
											2018-04-10 19:25:15 +08:00
+								    MF.getRegInfo().addLiveIn(GitPtrLo);
-												AMDGPU: Fix not adding ImplicitBufferPtr as a live-in

Fixes missing test from r293000.

llvm-svn: 362275

											
										
										
											2019-06-01 06:47:36 +08:00
+								    MBB.addLiveIn(GitPtrLo);
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
-												[AMDGPU] Scratch setup fix on AMDPAL gfx9+ merge shader

Summary:
With OS type AMDPAL, the scratch descriptor is hardwired to be loaded
from offset 0 of the global information table, whose low pointer is
passed in s0. For a merge shader on gfx9+, it needs to be s8 instead, as
the hardware reserves s0-s7.

Reviewers: kzhuravl

Subscribers: arsenm, nhaehnle, dstuttard, llvm-commits, t-tye, yaxunl, wdng, kzhuravl

Differential Revision: https://reviews.llvm.org/D42203

llvm-svn: 326088

											
										
										
											2018-02-26 22:46:43 +08:00
+								      .addReg(GitPtrLo)
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								    // We now have the GIT ptr - now get the scratch descriptor from the entry
-												[AMDGPU] For OS type AMDPAL, fixed scratch on compute shader

Summary:
For OS type AMDPAL, the scratch descriptor is loaded from offset 0 of
the GIT, whose 32 bit pointer is in s0 (s8 for gfx9 merged shaders).

This commit fixes that to use offset 0x10 instead of offset 0 for a
compute shader, per the PAL ABI spec.

V2: Ensure s0 (s8 for gfx9 merged shader) is marked live-in when loading
scratch descriptor from GIT.

Reviewers: kzhuravl, nhaehnle, timcorringham

Subscribers: kzhuravl, wdng, yaxunl, t-tye, llvm-commits, dstuttard, nhaehnle, arsenm

Differential Revision: https://reviews.llvm.org/D44468

Change-Id: I93dffa647758e37f613bb5e0dfca840d82e6d26f
llvm-svn: 329690

											
										
										
											2018-04-10 19:25:15 +08:00
+								    // at offset 0 (or offset 16 for a compute shader).
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								    PointerType *PtrTy =
-												MachineFunction: Return reference from getFunction(); NFC

The Function can never be nullptr so we can return a reference.

llvm-svn: 320884

											
										
										
											2017-12-16 06:22:58 +08:00
+								      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								                       AMDGPUAS::CONSTANT_ADDRESS);
 								    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 								    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
 								    auto MMO = MF.getMachineMemOperand(PtrInfo,
 								                                       MachineMemOperand::MOLoad |
 								                                       MachineMemOperand::MOInvariant |
 								                                       MachineMemOperand::MODereferenceable,
-												GlobalISel: Fix creating MMOs with align 0

llvm-svn: 352712

											
										
										
											2019-01-31 09:38:47 +08:00
+, 4);
-												AMDGPU: Pass function directly instead of MachineFunction

These functions just query the underlying IR function,
so pass it directly.

llvm-svn: 333442

											
										
										
											2018-05-30 01:42:50 +08:00
+								    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
-												[AMDGPU] Fix CS scratch setup on pre-GCN3 ASICs

Summary:
Prior to GCN3 s_load_dword offsets are in dwords rather than bytes.
Thus the scratch buffer descriptor offset must be adjusted for pre-GCN3 ASICs.

Reviewers: nhaehnle, tpr

Reviewed By: nhaehnle

Subscribers: sheredom, arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D56496

llvm-svn: 353530

											
										
										
											2019-02-08 23:41:11 +08:00
+								    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
 								    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
 								      .addReg(Rsrc01)
-												[AMDGPU] Fix CS scratch setup on pre-GCN3 ASICs

Summary:
Prior to GCN3 s_load_dword offsets are in dwords rather than bytes.
Thus the scratch buffer descriptor offset must be adjusted for pre-GCN3 ASICs.

Reviewers: nhaehnle, tpr

Reviewed By: nhaehnle

Subscribers: sheredom, arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D56496

llvm-svn: 353530

											
										
										
											2019-02-08 23:41:11 +08:00
+								      .addImm(EncodedOffset) // offset
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								      .addImm(0) // glc
-												[AMDGPU] gfx1010 VMEM and SMEM implementation

Differential Revision: https://reviews.llvm.org/D61330

llvm-svn: 359621

											
										
										
											2019-05-01 06:08:23 +08:00
+								      .addImm(0) // dlc
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
 								      .addMemOperand(MMO);
 								    return;
 								  }
-												AMDGPU: Pass function directly instead of MachineFunction

These functions just query the underlying IR function,
so pass it directly.

llvm-svn: 333442

											
										
										
											2018-05-30 01:42:50 +08:00
+								  if (ST.isMesaGfxShader(Fn)
-												[AMDGPU] AMDPAL scratch buffer support

Summary:
Added support for scratch (including spilling) for OS type amdpal:
generates code to set up the scratch descriptor if it is needed.

With amdpal, the scratch resource descriptor is loaded from offset 0 of
the global information table. The low 32 bits of the address of the
global information table is passed in s0.

Added amdgpu-git-ptr-high function attribute to hard-wire the high 32
bits of the address of the global information table. If the function
attribute is not specified, or is 0xffffffff, then the backend generates
code to use the high 32 bits of pc.

The documentation for the AMDPAL ABI will be added in a later commit.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye

Differential Revision: https://reviews.llvm.org/D37483

llvm-svn: 314501

											
										
										
											2017-09-29 17:49:35 +08:00
+								      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
-												AMDGPU: Rename isAmdCodeObjectV2 -> isAmdHsaOrMesa

The isAmdCodeObjectV2 is a misleading name which actually checks whether the os
is amdhsa or mesa.

Also add a test to make sure we do not generate old kernel header for code
object v3.

Differential Revision: https://reviews.llvm.org/D52897

llvm-svn: 343813

											
										
										
											2018-10-05 05:02:16 +08:00
+								    assert(!ST.isAmdHsaOrMesa(Fn));
-												AMDGPU: Use copy instead of mov during frame lowering

This occurs before RA pseudos are expanded. It's less
code to emit the copy.

llvm-svn: 280297

											
										
										
											2016-09-01 05:52:25 +08:00
+								    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
+								    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
 								    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
 								    // Use relocations to get the pointer, and setup the other bits manually.
 								    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-												AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling

This should not be treated as a different version of
private_segment_buffer. These are distinct things with
different uses and register classes, and requires the
function argument info to have more context about the
function's type and environment.

Also add missing test coverage for the intrinsic, and
emit an error for HSA. This also uncovers that the intrinsic
is broken unless there happen to be stack objects.

llvm-svn: 306264

											
										
										
											2017-06-26 11:01:31 +08:00
+								    if (MFI->hasImplicitBufferPtr()) {
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
-												MachineFunction: Return reference from getFunction(); NFC

The Function can never be nullptr so we can return a reference.

llvm-svn: 320884

											
										
										
											2017-12-16 06:22:58 +08:00
+								      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
 								        BuildMI(MBB, I, DL, Mov64, Rsrc01)
-												AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling

This should not be treated as a different version of
private_segment_buffer. These are distinct things with
different uses and register classes, and requires the
function argument info to have more context about the
function's type and environment.

Also add missing test coverage for the intrinsic, and
emit an error for HSA. This also uncovers that the intrinsic
is broken unless there happen to be stack objects.

llvm-svn: 306264

											
										
										
											2017-06-26 11:01:31 +08:00
+								          .addReg(MFI->getImplicitBufferPtrUserSGPR())
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								      } else {
 								        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
 								        PointerType *PtrTy =
-												MachineFunction: Return reference from getFunction(); NFC

The Function can never be nullptr so we can return a reference.

llvm-svn: 320884

											
										
										
											2017-12-16 06:22:58 +08:00
+								          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
-												AMDGPU: Fix set but not used warnings related to AMDGPUAS

Differential Revision: https://reviews.llvm.org/D39499

llvm-svn: 317114

											
										
										
											2017-11-02 03:12:38 +08:00
+								                           AMDGPUAS::CONSTANT_ADDRESS);
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 								        auto MMO = MF.getMachineMemOperand(PtrInfo,
 								                                           MachineMemOperand::MOLoad |
 								                                           MachineMemOperand::MOInvariant |
 								                                           MachineMemOperand::MODereferenceable,
-												GlobalISel: Fix creating MMOs with align 0

llvm-svn: 352712

											
										
										
											2019-01-31 09:38:47 +08:00
+, 4);
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
-												AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling

This should not be treated as a different version of
private_segment_buffer. These are distinct things with
different uses and register classes, and requires the
function argument info to have more context about the
function's type and environment.

Also add missing test coverage for the intrinsic, and
emit an error for HSA. This also uncovers that the intrinsic
is broken unless there happen to be stack objects.

llvm-svn: 306264

											
										
										
											2017-06-26 11:01:31 +08:00
+								          .addReg(MFI->getImplicitBufferPtrUserSGPR())
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								          .addImm(0) // offset
 								          .addImm(0) // glc
-												[AMDGPU] gfx1010 VMEM and SMEM implementation

Differential Revision: https://reviews.llvm.org/D61330

llvm-svn: 359621

											
										
										
											2019-05-01 06:08:23 +08:00
+								          .addImm(0) // dlc
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								          .addMemOperand(MMO)
 								          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-												AMDGPU: Fix not adding ImplicitBufferPtr as a live-in

Fixes missing test from r293000.

llvm-svn: 362275

											
										
										
											2019-06-01 06:47:36 +08:00
 								        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
 								        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
-												AMDGPU add support for spilling to a user sgpr pointed buffers

Summary:
This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1].

Patch By: Dave Airlie

Reviewers: nhaehnle, arsenm, tstellarAMD

Reviewed By: arsenm

Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D25428

llvm-svn: 293000

											
										
										
											2017-01-25 09:25:13 +08:00
+								      }
 								    } else {
 								      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
 								      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
 								      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
 								        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
 								        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
 								        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
 								        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								    }
-												AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

llvm-svn: 254331

											
										
										
											2015-12-01 05:16:03 +08:00
 								    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
 								      .addImm(Rsrc23 & 0xffffffff)
 								      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
 								      .addImm(Rsrc23 >> 32)
 								      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 								  }
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								}
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								// Find a scratch register that we can use at the start of the prologue to
 								// re-align the stack pointer.  We avoid using callee-save registers since they
 								// may appear to be free when this is called from canUseAsPrologue (during
 								// shrink wrapping), but then no longer be free when this is called from
 								// emitPrologue.
 								//
 								// FIXME: This is a bit conservative, since in the above case we could use one
 								// of the callee-save registers as a scratch temp to re-align the stack pointer,
 								// but we would then have to make sure that we were in fact saving at least one
 								// callee-save register in the prologue, which is additional complexity that
 								// doesn't seem worth the benefit.
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
 								                                                 LivePhysRegs &LiveRegs,
 								                                                 const TargetRegisterClass &RC) {
 								  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
 								  // Mark callee saved registers as used so we will not choose them.
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								  for (unsigned i = 0; CSRegs[i]; ++i)
 								    LiveRegs.addReg(CSRegs[i]);
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								  MachineRegisterInfo &MRI = MF.getRegInfo();
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								  for (unsigned Reg : RC) {
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								    if (LiveRegs.available(MRI, Reg))
 								      return Reg;
 								  }
 								  return AMDGPU::NoRegister;
 								}
-												AMDGPU: Start defining a calling convention

Partially implement callee-side for arguments and return values.
byval doesn't work properly, and most likely sret or other on-stack
return values don't work either.

llvm-svn: 303308

											
										
										
											2017-05-18 05:56:25 +08:00
+								void SIFrameLowering::emitPrologue(MachineFunction &MF,
 								                                   MachineBasicBlock &MBB) const {
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								  if (FuncInfo->isEntryFunction()) {
-												AMDGPU: Start defining a calling convention

Partially implement callee-side for arguments and return values.
byval doesn't work properly, and most likely sret or other on-stack
return values don't work either.

llvm-svn: 303308

											
										
										
											2017-05-18 05:56:25 +08:00
+								    emitEntryFunctionPrologue(MF, MBB);
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								    return;
 								  }
 								  const MachineFrameInfo &MFI = MF.getFrameInfo();
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
 								  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
 								  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								  LivePhysRegs LiveRegs;
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
 								  MachineBasicBlock::iterator MBBI = MBB.begin();
 								  DebugLoc DL;
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  bool HasFP = false;
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								  uint32_t NumBytes = MFI.getStackSize();
 								  uint32_t RoundedSize = NumBytes;
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  if (TRI.needsStackRealignment(MF)) {
 								    HasFP = true;
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								    const unsigned Alignment = MFI.getMaxAlignment();
 								    RoundedSize += Alignment;
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								    LiveRegs.init(TRI);
 								    LiveRegs.addLiveIns(MBB);
 								    unsigned ScratchSPReg
 								      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
 								                                         AMDGPU::SReg_32_XM0RegClass);
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								    assert(ScratchSPReg != AMDGPU::NoRegister);
 								    // s_add_u32 tmp_reg, s32, NumBytes
 								    // s_and_b32 s32, tmp_reg, 0b111...0000
 								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
 								      .addReg(StackPtrReg)
 								      .addImm((Alignment - 1) * ST.getWavefrontSize())
 								      .setMIFlag(MachineInstr::FrameSetup);
 								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
 								      .addReg(ScratchSPReg, RegState::Kill)
 								      .addImm(-Alignment * ST.getWavefrontSize())
 								      .setMIFlag(MachineInstr::FrameSetup);
 								    FuncInfo->setIsStackRealigned(true);
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  } else if ((HasFP = hasFP(MF))) {
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								    // If we need a base pointer, set it up here. It's whatever the value of
 								    // the stack pointer is at this point. Any variable size objects will be
 								    // allocated after this, so we can still use the base pointer to reference
 								    // locals.
 								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
 								      .addReg(StackPtrReg)
 								      .setMIFlag(MachineInstr::FrameSetup);
 								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  if (HasFP && RoundedSize != 0) {
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
 								      .addReg(StackPtrReg)
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								      .addImm(RoundedSize * ST.getWavefrontSize())
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								      .setMIFlag(MachineInstr::FrameSetup);
 								  }
-												AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

llvm-svn: 309783

											
										
										
											2017-08-02 09:52:45 +08:00
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
 								  // turn on all lanes before doing the spill to memory.
 								  unsigned ScratchExecCopy = AMDGPU::NoRegister;
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
 								         : FuncInfo->getSGPRSpillVGPRs()) {
 								    if (!Reg.FI.hasValue())
 								      continue;
 								    if (ScratchExecCopy == AMDGPU::NoRegister) {
 								      if (LiveRegs.empty()) {
 								        LiveRegs.init(TRI);
 								        LiveRegs.addLiveIns(MBB);
 								      }
 								      ScratchExecCopy
 								        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
 								                                           AMDGPU::SReg_64_XEXECRegClass);
 								      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
 								              ScratchExecCopy)
 								        .addImm(-1);
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								    }
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
 								                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
 								                             &TII->getRegisterInfo());
 								  }
 								  if (ScratchExecCopy != AMDGPU::NoRegister) {
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								    // FIXME: Split block and make terminator.
 								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
 								      .addReg(ScratchExecCopy);
-												AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

llvm-svn: 309783

											
										
										
											2017-08-02 09:52:45 +08:00
+								  }
-												AMDGPU: Start defining a calling convention

Partially implement callee-side for arguments and return values.
byval doesn't work properly, and most likely neither do sret or other
on-stack return values.

llvm-svn: 303308

											
										
										
											2017-05-18 05:56:25 +08:00
+								}
-												AMDGPU: Cleanup subtarget handling.

Split AMDGPUSubtarget into amdgcn/r600 specific subclasses.
This removes most of the static_casting of the basic codegen
classes everywhere, and tries to restrict the features
visible on the wrong target.

llvm-svn: 273652

											
										
										
											2016-06-24 14:30:11 +08:00
+								void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 								                                   MachineBasicBlock &MBB) const {
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 								  if (FuncInfo->isEntryFunction())
 								    return;
-												AMDGPU: Cleanup subtarget handling.

Split AMDGPUSubtarget into amdgcn/r600 specific subclasses.
This removes most of the static_casting of the basic codegen
classes everywhere, and tries to restrict the features
visible on the wrong target.

llvm-svn: 273652

											
										
										
											2016-06-24 14:30:11 +08:00
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

llvm-svn: 309783

											
										
										
											2017-08-02 09:52:45 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
 								  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								  DebugLoc DL;
-												AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

llvm-svn: 309783

											
										
										
											2017-08-02 09:52:45 +08:00
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								  unsigned ScratchExecCopy = AMDGPU::NoRegister;
 								  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
 								         : FuncInfo->getSGPRSpillVGPRs()) {
 								    if (!Reg.FI.hasValue())
 								      continue;
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								    if (ScratchExecCopy == AMDGPU::NoRegister) {
 								      // See emitPrologue
 								      LivePhysRegs LiveRegs(*ST.getRegisterInfo());
 								      LiveRegs.addLiveIns(MBB);
 								      ScratchExecCopy
 								        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
 								                                           AMDGPU::SReg_64_XEXECRegClass);
 								      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
 								        .addImm(-1);
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								    }
-												AMDGPU: Don't enable all lanes with non-CSR VGPR spills

If the only VGPRs used for SGPR spilling were not CSRs, this was
enabling all lanes and immediately restoring exec. This is the usual
situation in leaf functions.

llvm-svn: 361848

											
										
										
											2019-05-29 00:46:02 +08:00
+								    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
 								                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
 								                              &TII->getRegisterInfo());
 								  }
 								  if (ScratchExecCopy != AMDGPU::NoRegister) {
-												AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

llvm-svn: 361655

											
										
										
											2019-05-25 02:18:51 +08:00
+								    // FIXME: Split block and make terminator.
 								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
 								      .addReg(ScratchExecCopy);
-												AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

llvm-svn: 309783

											
										
										
											2017-08-02 09:52:45 +08:00
+								  }
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  if (hasFP(MF)) {
 								    const MachineFrameInfo &MFI = MF.getFrameInfo();
 								    uint32_t NumBytes = MFI.getStackSize();
-												AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.

I think some of the SP/FP handling logic is still backwards,
and overly simplistic to support all of the stack features.
Code which tries to modify the SP with inline asm for example
or variable sized objects will probably require redoing this.

llvm-svn: 328831

											
										
										
											2018-03-30 05:30:06 +08:00
+								    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
 								      NumBytes + MFI.getMaxAlignment() : NumBytes;
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
 								      .addReg(StackPtrReg)
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								      .addImm(RoundedSize * ST.getWavefrontSize())
 								      .setMIFlag(MachineInstr::FrameDestroy);
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								  }
-												AMDGPU: Cleanup subtarget handling.

Split AMDGPUSubtarget into amdgcn/r600 specific subclasses.
This removes most of the static_casting of the basic codegen
classes everywhere, and tries to restrict the features
visible on the wrong target.

llvm-svn: 273652

											
										
										
											2016-06-24 14:30:11 +08:00
+								}
-												AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

This should avoid reporting any stack needs to be allocated in the
case where no stack is truly used. An unused stack slot is still
left around in other cases where there are real stack objects
but no spilling occurs.

llvm-svn: 295891

											
										
										
											2017-02-23 06:23:32 +08:00
+								static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
 								  // Returns true when every frame object index in MFI is marked dead, and
 								  // also when the index range is empty. Returns false as soon as one live
 								  // (non-dead) object index is found.
 								  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
 								       I != E; ++I) {
 								    // A single live object is enough to answer "no".
 								    if (!MFI.isDeadObjectIndex(I))
 								      return false;
 								  }
 								  return true;
 								}
-												[AMDGPU] Split R600/SI getFrameIndexReference and emit stack object offsets for SI

Differential Revision: https://reviews.llvm.org/D29674

llvm-svn: 297499

											
										
										
											2017-03-11 03:39:07 +08:00
+								int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
 								                                            unsigned &FrameReg) const {
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
-												[AMDGPU] Split R600/SI getFrameIndexReference and emit stack object offsets for SI

Differential Revision: https://reviews.llvm.org/D29674

llvm-svn: 297499

											
										
										
											2017-03-11 03:39:07 +08:00
 								  FrameReg = RI->getFrameRegister(MF);
 								  return MF.getFrameInfo().getObjectOffset(FI);
 								}
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								void SIFrameLowering::processFunctionBeforeFrameFinalized(
 								  MachineFunction &MF,
 								  RegScavenger *RS) const {
-												MachineFunction: Return reference for getFrameInfo(); NFC

getFrameInfo() never returns nullptr so we should use a reference
instead of a pointer.

llvm-svn: 277017

											
										
										
											2016-07-29 02:40:00 +08:00
+								  MachineFrameInfo &MFI = MF.getFrameInfo();
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
-												MachineFunction: Return reference for getFrameInfo(); NFC

getFrameInfo() never returns nullptr so we should use a reference
instead of a pointer.

llvm-svn: 277017

											
										
										
											2016-07-29 02:40:00 +08:00
+								  if (!MFI.hasStackObjects())
-												AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

llvm-svn: 254329

											
										
										
											2015-12-01 05:15:53 +08:00
+								    return;
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

This should avoid reporting any stack needs to be allocated in the
case where no stack is truly used. An unused stack slot is still
left around in other cases where there are real stack objects
but no spilling occurs.

llvm-svn: 295891

											
										
										
											2017-02-23 06:23:32 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
 								  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 								  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 								  bool AllSGPRSpilledToVGPRs = false;
 								  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
 								    AllSGPRSpilledToVGPRs = true;
 								    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
 								    // are spilled to VGPRs, in which case we can eliminate the stack usage.
 								    //
 								    // XXX - This operates under the assumption that only other SGPR spills are
 								    // users of the frame index. I'm not 100% sure this is correct. The
 								    // StackColoring pass has a comment saying a future improvement would be to
 								    // merging of allocas with spill slots, but for now according to
 								    // MachineFrameInfo isSpillSlot can't alias any other object.
 								    for (MachineBasicBlock &MBB : MF) {
 								      MachineBasicBlock::iterator Next;
 								      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
 								        MachineInstr &MI = *I;
 								        Next = std::next(I);
 								        if (TII->isSGPRSpill(MI)) {
 								          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
-												AMDGPU: Assign enum name to stack ID

Also assert that it is correct for SGPRs. There is currently a bug
where stack slot coloring replaces SGPR spill FIs with one with
the default ID, which results in a more confusing assert later
about a dead object.

llvm-svn: 330607

											
										
										
											2018-04-23 23:51:26 +08:00
+								          assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
-												AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

This should avoid reporting any stack needs to be allocated in the
case where no stack is truly used. An unused stack slot is still
left around in other cases where there are real stack objects
but no spilling occurs.

llvm-svn: 295891

											
										
										
											2017-02-23 06:23:32 +08:00
+								          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
 								            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
 								            (void)Spilled;
 								            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
 								          } else
 								            AllSGPRSpilledToVGPRs = false;
 								        }
 								      }
 								    }
 								  }
-												Enforce StackID definition in PEI

There are various places in LLVM where the definition of StackID is not
properly honoured, for example in PEI where objects with a StackID > 0 are
allocated on the default stack (StackID0). This patch enforces that PEI
only considers allocating objects to StackID 0.

Reviewers: arsenm, thegameg, MatzeB

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D60062

llvm-svn: 357460

											
										
										
											2019-04-02 17:46:52 +08:00
+								  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
-												AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

This should avoid reporting any stack needs to be allocated in the
case where no stack is truly used. An unused stack slot is still
left around in other cases where there are real stack objects
but no spilling occurs.

llvm-svn: 295891

											
										
										
											2017-02-23 06:23:32 +08:00
+								  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
 								  // but currently hasNonSpillStackObjects is set only from source
 								  // allocas. Stack temps produced from legalization are not counted currently.
 								  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
 								      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
 								    assert(RS && "RegScavenger required if spilling");
-												AMDGPU: Don't fix emergency stack slot at offset 0

This forced the caller to be aware of this, which is an ugly ABI
feature.

Partially reverts r295877. The original reasons for doing this are
mostly fixed. Alloca is now in a non-0 address space, so it should be
OK to have 0 as a valid pointer. Since we treat the absolute address
as the pointer value, this part only really needed to apply to
kernels.

Since r357093, we avoid the need to increment/decrement the offset
register in more cases, and since r354816 the scavenger can fail
without spilling, so it's less critical that we try to avoid an offset
that fits in the MUBUF offset.

Restrict to callable functions for now to split this into 2 steps to
limit the number of test updates and in case anything breaks.

llvm-svn: 362665

											
										
										
											2019-06-06 06:37:50 +08:00
+								    if (FuncInfo->isEntryFunction()) {
 								      int ScavengeFI = MFI.CreateFixedObject(
 								        TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
 								      RS->addScavengingFrameIndex(ScavengeFI);
 								    } else {
 								      int ScavengeFI = MFI.CreateStackObject(
 								        TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
 								        TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
 								        false);
 								      RS->addScavengingFrameIndex(ScavengeFI);
 								    }
-												AMDGPU: Always allocate emergency stack slot at offset 0

This allows us to ensure that 0 is never a valid pointer
to a user object, and ensures that the offset is always legal
without needing a register to access it. This comes at the cost
of usable offsets and wasted stack space.

llvm-svn: 295877

											
										
										
											2017-02-23 05:05:25 +08:00
+								  }
-												AMDGPU: Create emergency stack slots during frame lowering

Test has a bogus verifier error which will be fixed by later commits.

llvm-svn: 252327

											
										
										
											2015-11-07 02:17:45 +08:00
+								}
-												[AMDGPU] Emit debugger prologue and emit the rest of the debugger fields in the kernel code header

Debugger prologue is emitted if -mattr=+amdgpu-debugger-emit-prologue.

Debugger prologue writes work group IDs and work item IDs to scratch memory at fixed location in the following format:
  - offset 0: work group ID x
  - offset 4: work group ID y
  - offset 8: work group ID z
  - offset 16: work item ID x
  - offset 20: work item ID y
  - offset 24: work item ID z

Set
  - amd_kernel_code_t::debug_wavefront_private_segment_offset_sgpr to scratch wave offset reg
  - amd_kernel_code_t::debug_private_segment_buffer_sgpr to scratch rsrc reg
  - amd_kernel_code_t::is_debug_supported to true if all debugger features are enabled

Differential Revision: http://reviews.llvm.org/D20335

llvm-svn: 273769

											
										
										
											2016-06-25 11:11:28 +08:00
-												AMDGPU: Don't spill SP reg like a normal CSR

llvm-svn: 313217

											
										
										
											2017-09-14 07:47:01 +08:00
+								void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
 								                                           RegScavenger *RS) const {
 								  // Computes the callee-saved register set by deferring to the generic
 								  // TargetFrameLowering implementation, then clears the entry for the
 								  // stack pointer offset register from SavedRegs.
 								  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
 								  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 								  // The SP is specifically managed and we don't want extra spills of it.
 								  SavedRegs.reset(MFI->getStackPtrOffsetReg());
 								}
-												AMDGPU: Initial implementation of calls

Includes a hack to fix the type selected for
the GlobalAddress of the function, which will be
fixed by changing the default datalayout to use
generic pointers for 0.

llvm-svn: 309732

											
										
										
											2017-08-02 03:54:18 +08:00
+								MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
 								  MachineFunction &MF,
 								  MachineBasicBlock &MBB,
 								  MachineBasicBlock::iterator I) const {
 								  int64_t Amount = I->getOperand(0).getImm();
 								  if (Amount == 0)
 								    return MBB.erase(I);
-												AMDGPU: Refactor Subtarget classes

Summary:
This is a follow-up to r335942.
- Merge SISubtarget into AMDGPUSubtarget and rename to GCNSubtarget
- Rename AMDGPUCommonSubtarget to AMDGPUSubtarget
- Merge R600Subtarget::Generation and GCNSubtarget::Generation into
  AMDGPUSubtarget::Generation.

Reviewers: arsenm, jvesely

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D49037

llvm-svn: 336851

											
										
										
											2018-07-12 04:59:01 +08:00
+								  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-												AMDGPU: Initial implementation of calls

Includes a hack to fix the type selected for
the GlobalAddress of the function, which will be
fixed by changing the default datalayout to use
generic pointers for 0.

llvm-svn: 309732

											
										
										
											2017-08-02 03:54:18 +08:00
+								  const SIInstrInfo *TII = ST.getInstrInfo();
 								  const DebugLoc &DL = I->getDebugLoc();
 								  unsigned Opc = I->getOpcode();
 								  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
 								  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
 								  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 								  if (!TFI->hasReservedCallFrame(MF)) {
 								    unsigned Align = getStackAlignment();
 								    Amount = alignTo(Amount, Align);
 								    assert(isUInt<32>(Amount) && "exceeded stack address space size");
 								    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 								    unsigned SPReg = MFI->getStackPtrOffsetReg();
 								    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
 								    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
 								      .addReg(SPReg)
 								      .addImm(Amount * ST.getWavefrontSize());
 								  } else if (CalleePopAmount != 0) {
 								    llvm_unreachable("is this used?");
 								  }
 								  return MBB.erase(I);
 								}
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
 								  // Reports whether this function needs a frame pointer (FP).
 								  // Two groups of conditions are checked below: ones that only matter
 								  // when the function makes calls, and frame properties that force an FP
 								  // whether or not there are calls.
 								  const MachineFrameInfo &MFI = MF.getFrameInfo();
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
+								  if (MFI.hasCalls()) {
 								    // All offsets are unsigned, so need to be addressed in the same direction
 								    // as stack growth.
 								    if (MFI.getStackSize() != 0)
 								      return true;
 								    // For the entry point, the input wave scratch offset must be copied to the
 								    // API SP if there are calls.
 								    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
 								      return true;
 								    // Retain behavior of always omitting the FP for leaf functions when
 								    // possible.
 								    if (MF.getTarget().Options.DisableFramePointerElim(MF))
 								      return true;
 								  }
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
-												AMDGPU: Invert frame index offset interpretation

Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treating it as an offset from the
scratch wave offset register as a frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.

Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.

The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.

Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.

Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.

llvm-svn: 362661

											
										
										
											2019-06-06 06:20:47 +08:00
 								  // Reached when no call-dependent condition above returned true: an FP is
 								  // still required if the frame has variable-sized objects, its address is
 								  // taken, it has a stack map or patchpoint, or the stack must be realigned.
+								  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
 								    MFI.hasStackMap() || MFI.hasPatchPoint() ||
 								    MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF);
-												AMDGPU: Setup SP/FP in callee function prolog/epilog

llvm-svn: 306312

											
										
										
											2017-06-27 01:53:59 +08:00
+								}