From eba61071d7bac3933af19064a27b56c02f51be27 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Fri, 2 May 2014 15:41:42 +0000
Subject: [PATCH] R600/SI: Only create one instruction when spilling/restoring register v3

The register spiller assumes that only one new instruction is created
when spilling and restoring registers, so we need to emit pseudo
instructions for vector register spills and lower them after register
allocation.

v2:
  - Fix calculation of lane index
  - Extend VGPR liveness to end of program.

v3:
  - Use SIMM16 field of S_NOP to specify multiple NOPs.

https://bugs.freedesktop.org/show_bug.cgi?id=75005

llvm-svn: 207843
---
 llvm/lib/Target/R600/SIInstrInfo.cpp           | 165 +++++++++++++++---
 llvm/lib/Target/R600/SIInstrInfo.h             |   4 +
 llvm/lib/Target/R600/SIInstructions.td         |  23 ++-
 .../lib/Target/R600/SIMachineFunctionInfo.cpp  |  63 +++++--
 llvm/lib/Target/R600/SIMachineFunctionInfo.h   |   7 +-
 llvm/lib/Target/R600/SIRegisterInfo.cpp        |   7 +
 llvm/lib/Target/R600/SIRegisterInfo.h          |   6 +
 7 files changed, 235 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/R600/SIInstrInfo.cpp b/llvm/lib/Target/R600/SIInstrInfo.cpp
index 5d08b91ea7bb..454b7c2d55f8 100644
--- a/llvm/lib/Target/R600/SIInstrInfo.cpp
+++ b/llvm/lib/Target/R600/SIInstrInfo.cpp
@@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned KillFlag = isKill ? RegState::Kill : 0;
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
-    unsigned Lane = MFI->SpillTracker.getNextLane(MRI);
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
-            MFI->SpillTracker.LaneVGPR)
+    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
             .addReg(SrcReg, KillFlag)
             .addImm(Lane);
+    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+  } else if (RI.isSGPRClass(RC)) {
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use pseudo instructions for vector
+    // registers.
+    //
+    // Reserve a spot in the spill tracker for each sub-register of
+    // the vector register.
+    unsigned NumSubRegs = RC->getSize() / 4;
+    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
+                                                        NumSubRegs);
     MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
-                                    Lane);
-  } else {
-    for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
-      unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg)
-              .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
-      storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i,
-                          &AMDGPU::SReg_32RegClass, TRI);
+                                    FirstLane);
+
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
+    default: llvm_unreachable("Cannot spill register class");
     }
+
+    BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
+            .addReg(SrcReg)
+            .addImm(FrameIndex);
+  } else {
+    llvm_unreachable("VGPR spilling not supported");
   }
 }
 
@@ -216,32 +234,127 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        unsigned DestReg, int FrameIndex,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
-    SIMachineFunctionInfo::SpilledReg Spill = 
+    SIMachineFunctionInfo::SpilledReg Spill =
         MFI->SpillTracker.getSpilledReg(FrameIndex);
     assert(Spill.VGPR);
     BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
             .addReg(Spill.VGPR)
             .addImm(Spill.Lane);
-  } else {
-    for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
-      unsigned Flags = RegState::Define;
-      if (i == 0) {
-        Flags |= RegState::Undef;
-      }
-      unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i,
-                           &AMDGPU::SReg_32RegClass, TRI);
-      BuildMI(MBB, MI, DL, get(AMDGPU::COPY))
-              .addReg(DestReg, Flags, RI.getSubRegFromChannel(i))
-              .addReg(SubReg);
+    insertNOPs(MI, 3);
+  } else if (RI.isSGPRClass(RC)) {
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
+    default: llvm_unreachable("Cannot spill register class");
     }
+
+    SIMachineFunctionInfo::SpilledReg Spill =
+        MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+            .addReg(Spill.VGPR)
+            .addImm(FrameIndex);
+    insertNOPs(MI, 3);
+  } else {
+    llvm_unreachable("VGPR spilling not supported");
   }
 }
 
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+  switch (Op) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+    return 16;
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+    return 8;
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+    return 4;
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+    return 2;
+  default: llvm_unreachable("Invalid spill opcode");
+  }
+}
+
+void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
+                             int Count) const {
+  while (Count > 0) {
+    int Arg;
+    if (Count >= 8)
+      Arg = 7;
+    else
+      Arg = Count - 1;
+    Count -= 8;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+            .addImm(Arg);
+  }
+}
+
+bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  SIMachineFunctionInfo *MFI =
+      MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  switch (MI->getOpcode()) {
+  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+
+  // SGPR register spill
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S64_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned FrameIndex = MI->getOperand(2).getImm();
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
+                                            &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
+              MI->getOperand(0).getReg())
+              .addReg(SubReg)
+              .addImm(Spill.Lane + i);
+    }
+    MI->eraseFromParent();
+    break;
+  }
+
+  // SGPR register restore
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_S64_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned FrameIndex = MI->getOperand(2).getImm();
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
+                                            &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
+              .addReg(MI->getOperand(1).getReg())
+              .addImm(Spill.Lane + i);
+    }
+    MI->eraseFromParent();
+    break;
+  }
+  }
+  return true;
+}
+
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                               bool NewMI) const {
 
diff --git a/llvm/lib/Target/R600/SIInstrInfo.h b/llvm/lib/Target/R600/SIInstrInfo.h
index 63f1d7fdee89..d7992742216d 100644
--- a/llvm/lib/Target/R600/SIInstrInfo.h
+++ b/llvm/lib/Target/R600/SIInstrInfo.h
@@ -73,6 +73,8 @@ public:
                              const TargetRegisterClass *RC,
                              const TargetRegisterInfo *TRI) const override;
 
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
   unsigned commuteOpcode(unsigned Opcode) const;
 
   MachineInstr *commuteInstruction(MachineInstr *MI,
@@ -165,6 +167,8 @@ public:
 
   void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
               unsigned SavReg, unsigned IndexReg) const;
+
+  void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
 };
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td
index 27e7abe1a388..b93de36ddf86 100644
--- a/llvm/lib/Target/R600/SIInstructions.td
+++ b/llvm/lib/Target/R600/SIInstructions.td
@@ -369,7 +369,7 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
 
 let Predicates = [isSI] in {
 
-//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
+def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>;
 
 let isTerminator = 1 in {
 
@@ -1574,6 +1574,27 @@ def V_SUB_F64 : InstSI <
 
 } // end usesCustomInserter
 
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
+
+  def _SAVE : InstSI <
+    (outs VReg_32:$dst),
+    (ins sgpr_class:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+  def _RESTORE : InstSI <
+    (outs sgpr_class:$dst),
+    (ins VReg_32:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+}
+
+defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
 } // end IsCodeGenOnly, isPseudo
 
 def : Pat<
diff --git a/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp b/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
index ea04346e5097..af609958129c 100644
--- a/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,8 +10,11 @@
 
 #include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 
 #define MAX_LANES 64
 
@@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PSInputAddr(0),
     SpillTracker() { }
 
-static unsigned createLaneVGPR(MachineRegisterInfo &MRI) {
-  return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-}
+static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
+  unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
 
-unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) {
-  if (!LaneVGPR) {
-    LaneVGPR = createLaneVGPR(MRI);
-  } else {
-    CurrentLane++;
-    if (CurrentLane == MAX_LANES) {
-      CurrentLane = 0;
-      LaneVGPR = createLaneVGPR(MRI);
+  // We need to add this register as live out for the function, in order to
+  // have its live range calculated directly.
+  //
+  // When register spilling begins, we have already calculated the live
+  // intervals for all the registers.  Since we are spilling SGPRs to
+  // VGPRs, we need to update the Lane VGPR's live interval every time we
+  // spill or restore a register.
+  //
+  // Unfortunately, there is no good way to update the live interval, as
+  // the TargetInstrInfo callbacks for spilling and restoring don't give
+  // us access to the live interval information.
+  //
+  // We are lucky, though, because the InlineSpiller calls
+  // LiveRangeEdit::calculateRegClassAndHint(), which iterates through
+  // all the new registers that have been created when restoring a register
+  // and calls LiveIntervals::getInterval(), which creates and computes
+  // the live interval for the newly created register.  However, once this
+  // live interval is created, it doesn't change, and since we usually reuse
+  // the Lane VGPR multiple times, this means any uses after the first aren't
+  // added to the live interval.
+  //
+  // To work around this, we add Lane VGPRs to the function's live-out list,
+  // so that we can guarantee their live ranges will cover all of their uses.
+
+  for (MachineBasicBlock &MBB : *MF) {
+    if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
+      MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
+      return VGPR;
     }
   }
-  return CurrentLane;
+
+  MF->getFunction()->getContext().emitError(
+      "Could not find S_ENDPGM instruction.");
+
+  return VGPR;
+}
+
+unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
+    MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
+  unsigned StartLane = CurrentLane;
+  CurrentLane += NumRegs;
+  if (!LaneVGPR) {
+    LaneVGPR = createLaneVGPR(MRI, MF);
+  } else {
+    if (CurrentLane >= MAX_LANES) {
+      StartLane = CurrentLane = 0;
+      LaneVGPR = createLaneVGPR(MRI, MF);
+    }
+  }
+  return StartLane;
 }
 
 void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
diff --git a/llvm/lib/Target/R600/SIMachineFunctionInfo.h b/llvm/lib/Target/R600/SIMachineFunctionInfo.h
index ef38270d51a1..96e619bde8d6 100644
--- a/llvm/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/R600/SIMachineFunctionInfo.h
@@ -43,7 +43,12 @@ public:
 public:
   unsigned LaneVGPR;
   RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
-  unsigned getNextLane(MachineRegisterInfo &MRI);
+  /// \p NumRegs The number of consecutive registers that need to be spilled.
+  ///            This function will ensure that all registers are stored in
+  ///            the same VGPR.
+  /// \returns The lane to be used for storing the first register.
+  unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
+                        unsigned NumRegs = 1);
   void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
   const SpilledReg& getSpilledReg(unsigned FrameIndex);
   bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
diff --git a/llvm/lib/Target/R600/SIRegisterInfo.cpp b/llvm/lib/Target/R600/SIRegisterInfo.cpp
index 5897fbca94c5..8dc9a05799c3 100644
--- a/llvm/lib/Target/R600/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/R600/SIRegisterInfo.cpp
@@ -129,3 +129,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
     return &AMDGPU::VGPR_32RegClass;
   }
 }
+
+unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
+                                          const TargetRegisterClass *SubRC,
+                                          unsigned Channel) const {
+  unsigned Index = getHWRegIndex(Reg);
+  return SubRC->getRegister(Index + Channel);
+}
diff --git a/llvm/lib/Target/R600/SIRegisterInfo.h b/llvm/lib/Target/R600/SIRegisterInfo.h
index 54717c184bc5..36b4fcd32a89 100644
--- a/llvm/lib/Target/R600/SIRegisterInfo.h
+++ b/llvm/lib/Target/R600/SIRegisterInfo.h
@@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   /// be returned.
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;
+
+  /// \p Channel This is the register channel (e.g. a value from 0-15), not the
+  ///            SubReg index.
+  /// \returns The sub-register of Reg that is in Channel.
+  unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
+                            unsigned Channel) const;
 };
 
 } // End namespace llvm
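
Note (not part of the patch): a minimal standalone sketch of the S_NOP
chunking performed by SIInstrInfo::insertNOPs above. Judging from the loop,
the SIMM16 field encodes one less than the number of NOPs a single S_NOP
covers, with at most 8 NOPs (SIMM16 = 7) per instruction; larger counts are
split across several S_NOPs. The helper name nopImmediates is hypothetical
and exists only for this illustration.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Returns the SIMM16 immediates insertNOPs would emit for `count` NOPs:
    // chunks of up to 8 NOPs, each encoded as (NOPs in chunk) - 1.
    static std::vector<int> nopImmediates(int count) {
      std::vector<int> imms;
      while (count > 0) {
        imms.push_back(std::min(count, 8) - 1); // 7 for a full chunk of 8
        count -= 8;
      }
      return imms;
    }

    int main() {
      for (int imm : nopImmediates(19)) // 19 NOPs -> S_NOP 7, S_NOP 7, S_NOP 2
        std::printf("S_NOP %d\n", imm);
      return 0;
    }

The same arithmetic explains the insertNOPs(MI, 3) calls after each restore:
the three NOPs fit in a single instruction, S_NOP 2.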