[AMDGPU] Iterative scheduling infrastructure + minimal registry scheduler

Differential revision: https://reviews.llvm.org/D31046

llvm-svn: 298368
This commit is contained in:
Valery Pykhtin 2017-03-21 13:15:46 +00:00
parent 044e003203
commit fd4c410f4d
12 changed files with 1764 additions and 3 deletions

View File

@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "SIFrameLowering.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@ -317,6 +318,11 @@ public:
/// the given LDS memory size is the only constraint.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
// Convenience overload: queries occupancy using the LDS size recorded in
// this machine function's SIMachineFunctionInfo, forwarding to the
// byte-count overload above.
unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  return getOccupancyWithLocalMemSize(
      MF.getInfo<SIMachineFunctionInfo>()->getLDSSize(), *MF.getFunction());
}
// FP16 and FP64 denormal support are controlled by the same subtarget flag
// (note the shared FP64FP16Denormals member).
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}

View File

@@ -24,6 +24,7 @@
#endif
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
@ -155,6 +156,20 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
return DAG;
}
// Factory for the "gcn-max-occupancy-experimental" registry entry: the
// iterative scheduler driven by the legacy max-occupancy strategy.
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto *Sched = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  // Cluster neighboring loads and stores, as the default GCN scheduler does.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));
  Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  return Sched;
}
// Factory for the "gcn-minreg" registry entry below: iterative scheduler
// driven by the forced minimal-register-usage strategy.
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
return new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@ -168,6 +183,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
// Experimental iterative variant of the max-occupancy scheduler.
static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",
createIterativeGCNMaxOccupancyMachineScheduler);
// Iterative scheduler forcing minimal register usage.
static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.

View File

@ -94,6 +94,9 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
GCNRegPressure.cpp
${GLOBAL_ISEL_BUILD_FILES}
)

View File

@@ -0,0 +1,528 @@
//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
namespace llvm {
// Defined in GCNMinRegStrategy.cpp: builds a schedule for the given DAG,
// starting from TopRoots, that tries to minimize register usage.
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
}
// shim accessors for different order containers
// These overloads let the templated scheduleRegion/getSchedulePressure code
// consume schedules expressed as MachineInstr* lists, SUnit* lists or SUnit
// ranges through a single spelling.
static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
return MI;
}
static inline MachineInstr *getMachineInstr(const SUnit *SU) {
return SU->getInstr();
}
static inline MachineInstr *getMachineInstr(const SUnit &SU) {
return SU.getInstr();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump the instructions of region [Begin, End), prefixed with their slot
// indexes when LIS is available. At most MaxInstNum instructions are shown;
// longer regions are elided with "..." followed by the region's last
// instruction. The boundary instruction at End (if inside the block) is
// printed after a "----" separator.
static void printRegion(raw_ostream &OS,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
const LiveIntervals *LIS,
unsigned MaxInstNum =
std::numeric_limits<unsigned>::max()) {
auto BB = Begin->getParent();
OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
<< ' ' << BB->getName() << ":\n";
auto I = Begin;
// Always show at least one instruction.
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
if (!I->isDebugValue() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (I != End) {
// Output was truncated: elide the middle, then show the last instruction.
OS << "\t...\n";
I = std::prev(End);
if (!I->isDebugValue() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (End != BB->end()) { // print boundary inst if present
OS << "----\n";
if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
OS << *End;
}
}
LLVM_DUMP_METHOD
// Dump the live-in and live-out register pressure of region [Begin, End),
// computed from the live intervals.
static void printLivenessInfo(raw_ostream &OS,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
const LiveIntervals *LIS) {
const auto BB = Begin->getParent();
const auto &MRI = BB->getParent()->getRegInfo();
const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
OS << "LIn RP: ";
getRegPressure(MRI, LiveIns).print(OS);
// When End is the block end there is no boundary instruction, so the last
// real instruction carries the live-outs.
const auto BottomMI = End == BB->end() ? std::prev(End) : End;
const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
OS << "LOt RP: ";
getRegPressure(MRI, LiveOuts).print(OS);
}
LLVM_DUMP_METHOD
// Dump every region recorded by enterRegion: its first instruction, its
// live-in/live-out pressure and the cached maximum pressure.
void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
const auto &ST = MF.getSubtarget<SISubtarget>();
for (const auto R : Regions) {
OS << "Region to schedule ";
// Only the first instruction of each region is shown (MaxInstNum = 1).
printRegion(OS, R->Begin, R->End, LIS, 1);
printLivenessInfo(OS, R->Begin, R->End, LIS);
OS << "Max RP: ";
R->MaxPressure.print(OS, &ST);
}
}
LLVM_DUMP_METHOD
// Dump a region together with its cached pressure before scheduling and the
// pressure RP measured after the schedule was applied.
void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
const Region *R,
const GCNRegPressure &RP) const {
OS << "\nAfter scheduling ";
printRegion(OS, R->Begin, R->End, LIS);
printSchedRP(OS, R->MaxPressure, RP);
OS << '\n';
}
LLVM_DUMP_METHOD
// Dump a before/after register pressure pair with subtarget occupancy
// annotations.
void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const {
const auto &ST = MF.getSubtarget<SISubtarget>();
OS << "RP before: ";
Before.print(OS, &ST);
OS << "RP after: ";
After.print(OS, &ST);
}
#endif
// DAG builder helper
// RAII helper: builds the scheduling DAG for a region via the base-class
// machinery (startBlock/enterRegion/buildSchedGraph) and keeps the top
// roots. The destructor unwinds the region/block state so the scheduler
// can go on to process further regions.
class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
SmallVector<SUnit*, 8> TopRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
auto BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
// Lane masks are tracked because the pressure trackers need them.
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.Topo.InitDAGTopologicalSorting();
SmallVector<SUnit*, 8> BotRoots;
// Bottom roots are computed but only the top roots are retained.
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
~BuildDAG() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
}
ArrayRef<const SUnit*> getTopRoots() const {
return TopRoots;
}
};
// RAII helper that temporarily installs a caller-owned MachineSchedStrategy
// into the scheduler so the base-class schedule() runs with it, restoring
// the original strategy (and region/block state) on destruction.
class GCNIterativeScheduler::OverrideLegacyStrategy {
GCNIterativeScheduler &Sch;
Region &Rgn;
// Original strategy, reinstated in the destructor.
std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
GCNRegPressure SaveMaxRP;
public:
OverrideLegacyStrategy(Region &R,
MachineSchedStrategy &OverrideStrategy,
GCNIterativeScheduler &_Sch)
: Sch(_Sch)
, Rgn(R)
, SaveSchedImpl(std::move(_Sch.SchedImpl))
, SaveMaxRP(R.MaxPressure) {
// NOTE: SchedImpl now holds a strategy this class does NOT own.
Sch.SchedImpl.reset(&OverrideStrategy);
auto BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
}
~OverrideLegacyStrategy() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
// release() first so the caller-owned override strategy isn't deleted.
Sch.SchedImpl.release();
Sch.SchedImpl = std::move(SaveSchedImpl);
}
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
DEBUG(dbgs() << "\nScheduling ";
printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
// Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
Sch.RegionEnd = Rgn.End;
//assert(Rgn.End == Sch.RegionEnd);
Rgn.Begin = Sch.RegionBegin;
// Cached pressure is stale now; callers recompute it after scheduling.
Rgn.MaxPressure.clear();
}
void restoreOrder() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
// DAG SUnits are stored using original region's order
// so just use SUnits as the restoring schedule
Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
}
};
// just a stub to make base class happy
// GCNIterativeScheduler drives scheduling itself from finalizeSchedule(),
// so the default strategy never picks nodes (pickNode returns nullptr).
class SchedStrategyStub : public MachineSchedStrategy {
public:
bool shouldTrackPressure() const override { return false; }
bool shouldTrackLaneMasks() const override { return false; }
void initialize(ScheduleDAGMI *DAG) override {}
SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
void schedNode(SUnit *SU, bool IsTopNode) override {}
void releaseTopNode(SUnit *SU) override {}
void releaseBottomNode(SUnit *SU) override {}
};
// The base class gets the no-op stub strategy; actual scheduling is driven
// from finalizeSchedule() according to the selected StrategyKind.
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
: BaseClass(C, llvm::make_unique<SchedStrategyStub>())
, Context(C)
, Strategy(S)
, UPTracker(*LIS) {
}
// returns max pressure for a region
GCNRegPressure
GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End)
const {
// For the purpose of pressure tracking bottom inst of the region should
// be also processed. End is either BB end, BB terminator inst or sched
// boundary inst.
auto const BBEnd = Begin->getParent()->end();
auto const BottomMI = End == BBEnd ? std::prev(End) : End;
// scheduleRegions walks bottom to top, so its likely we just get next
// instruction to track
auto AfterBottomMI = std::next(BottomMI);
if (AfterBottomMI == BBEnd ||
&*AfterBottomMI != UPTracker.getLastTrackedMI()) {
// Can't reuse previous tracker state; restart tracking at BottomMI.
UPTracker.reset(*BottomMI);
} else {
assert(UPTracker.isValid());
}
// Walk the region bottom-up, including Begin itself.
for (auto I = BottomMI; I != Begin; --I)
UPTracker.recede(*I);
UPTracker.recede(*Begin);
assert(UPTracker.isValid() ||
(dbgs() << "Tracked region ",
printRegion(dbgs(), Begin, End, LIS), false));
return UPTracker.moveMaxPressure();
}
// returns max pressure for a tentative schedule
template <typename Range> GCNRegPressure
GCNIterativeScheduler::getSchedulePressure(const Region &R,
Range &&Schedule) const {
auto const BBEnd = R.Begin->getParent()->end();
// Use a fresh tracker: the tentative schedule isn't applied to the IR yet.
GCNUpwardRPTracker RPTracker(*LIS);
if (R.End != BBEnd) {
// R.End points to the boundary instruction but the
// schedule doesn't include it
RPTracker.reset(*R.End);
RPTracker.recede(*R.End);
} else {
// R.End doesn't point to the boundary instruction
RPTracker.reset(*std::prev(BBEnd));
}
// Feed the tentative schedule to the tracker in bottom-up order.
for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
RPTracker.recede(*getMachineInstr(*--I));
}
return RPTracker.moveMaxPressure();
}
void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
// Only record regions with more than two instructions; the Region objects
// are bump-allocated and live for the whole pass.
if (NumRegionInstrs > 2) {
Regions.push_back(
new (Alloc.Allocate())
Region { Begin, End, NumRegionInstrs,
getRegionPressure(Begin, End), nullptr });
}
}
void GCNIterativeScheduler::schedule() { // overriden
// do nothing
// Real scheduling is deferred to finalizeSchedule() once all regions have
// been collected; this override only emits debug output.
DEBUG(
printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
dbgs() << "Max RP: ";
Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
}
dbgs() << '\n';
);
}
// Entry point for the actual work: once every region of the function has
// been recorded, dispatch to the strategy chosen at construction time.
void GCNIterativeScheduler::finalizeSchedule() { // overriden
  if (Regions.empty())
    return;
  if (Strategy == SCHEDULE_LEGACYMAXOCCUPANCY) {
    scheduleLegacyMaxOccupancy();
  } else if (Strategy == SCHEDULE_MINREGFORCED) {
    scheduleMinReg(true);
  } else {
    assert(Strategy == SCHEDULE_MINREGONLY);
    scheduleMinReg();
  }
}
// Detach schedule from SUnits and interleave it with debug values.
// Returned schedule becomes independent of DAG state.
std::vector<MachineInstr*>
GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
  std::vector<MachineInstr*> Detached;
  Detached.reserve(Schedule.size() * 2);
  if (FirstDbgValue)
    Detached.push_back(FirstDbgValue);
  auto DbgB = DbgValues.begin();
  auto DbgE = DbgValues.end();
  for (const SUnit *SU : Schedule) {
    MachineInstr *MI = SU->getInstr();
    Detached.push_back(MI);
    // If a DBG_VALUE was recorded against this instruction, keep it adjacent.
    auto D = std::find_if(DbgB, DbgE,
                          [MI](decltype(*DbgB) &P) { return P.second == MI; });
    if (D != DbgE)
      Detached.push_back(D->first);
  }
  return Detached;
}
// Record a detached copy of Schedule (plus its reported max pressure) as the
// region's BestSchedule, replacing any previous one.
void GCNIterativeScheduler::setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP) {
R.BestSchedule.reset(
new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
}
// Apply the region's previously recorded BestSchedule to the IR, then drop
// it: the IR now reflects that schedule.
void GCNIterativeScheduler::scheduleBest(Region &R) {
assert(R.BestSchedule.get() && "No schedule specified");
scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
R.BestSchedule.reset();
}
// minimal required region scheduler, works for ranges of SUnits*,
// SUnits or MachineIntrs*
// Applies Schedule to the IR: moves instructions into the given order,
// updates live intervals and operand flags, and refreshes the region's
// cached state. MaxRP, when non-empty, is recorded as the region's pressure
// and cross-checked against recomputed values in debug builds.
template <typename Range>
void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
const GCNRegPressure &MaxRP) {
assert(RegionBegin == R.Begin && RegionEnd == R.End);
assert(LIS != nullptr);
#ifndef NDEBUG
const auto SchedMaxRP = getSchedulePressure(R, Schedule);
#endif
auto BB = R.Begin->getParent();
auto Top = R.Begin;
for (const auto &I : Schedule) {
auto MI = getMachineInstr(I);
// Move MI into its scheduled position unless it is already there.
if (MI != &*Top) {
BB->remove(MI);
BB->insert(Top, MI);
if (!MI->isDebugValue())
LIS->handleMove(*MI, true);
}
if (!MI->isDebugValue()) {
// Reset read - undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
/*IgnoreDead*/false);
// Adjust liveness and add missing dead+read-undef flags.
auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
}
Top = std::next(MI->getIterator());
}
RegionBegin = getMachineInstr(Schedule.front());
// Schedule consisting of MachineInstr* is considered 'detached'
// and already interleaved with debug values
if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) {
placeDebugValues();
// Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
//assert(R.End == RegionEnd);
RegionEnd = R.End;
}
R.Begin = RegionBegin;
R.MaxPressure = MaxRP;
// Debug builds verify that predicted, reported and actual pressures agree.
#ifndef NDEBUG
const auto RegionMaxRP = getRegionPressure(R);
const auto &ST = MF.getSubtarget<SISubtarget>();
#endif
assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
|| (dbgs() << "Max RP mismatch!!!\n"
"RP for schedule (calculated): ",
SchedMaxRP.print(dbgs(), &ST),
dbgs() << "RP for schedule (reported): ",
MaxRP.print(dbgs(), &ST),
dbgs() << "RP after scheduling: ",
RegionMaxRP.print(dbgs(), &ST),
false));
}
// Sort recorded regions by pressure - highest at the front
// TargetOcc caps the occupancy considered when comparing pressures (see
// GCNRegPressure::less).
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
const auto &ST = MF.getSubtarget<SISubtarget>();
std::sort(Regions.begin(), Regions.end(),
[&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
}
///////////////////////////////////////////////////////////////////////////////
// Legacy MaxOccupancy Strategy

// Tries to increase occupancy applying minreg scheduler for a sequence of
// most demanding regions. Obtained schedules are saved as BestSchedule for a
// region.
// TargetOcc is the best achievable occupancy for a kernel.
// Returns better occupancy on success or current occupancy on fail.
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
  // TODO: assert Regions are sorted descending by pressure
  const auto &ST = MF.getSubtarget<SISubtarget>();
  // Regions are sorted highest pressure first, so the front region bounds
  // the currently achievable occupancy.
  const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
  // (fixed debug message: was "Trying to to improve")
  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
               << ", current = " << Occ << '\n');
  auto NewOcc = TargetOcc;
  for (auto R : Regions) {
    // Regions already at or above the best occupancy found can't limit it.
    if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
      break;
    DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
          printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
    BuildDAG DAG(*R, *this);
    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
    const auto MaxRP = getSchedulePressure(*R, MinSchedule);
    DEBUG(dbgs() << "Occupancy improvement attempt:\n";
          printSchedRP(dbgs(), R->MaxPressure, MaxRP));
    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
    // Stop as soon as the minreg schedule no longer beats current occupancy.
    if (NewOcc <= Occ)
      break;
    setBestSchedule(*R, MinSchedule, MaxRP);
  }
  DEBUG(dbgs() << "New occupancy = " << NewOcc
               << ", prev occupancy = " << Occ << '\n');
  return std::max(NewOcc, Occ);
}
// Schedule all recorded regions with the legacy GCNMaxOccupancySchedStrategy,
// optionally first running the minreg scheduler to raise the occupancy
// target. Regions that still miss the target fall back to their saved
// BestSchedule, or to the original instruction order.
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
const auto &ST = MF.getSubtarget<SISubtarget>();
auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
if (TryMaximizeOccupancy && Occ < TgtOcc)
Occ = tryMaximizeOccupancy(TgtOcc);
// This is really weird but for some magic scheduling regions twice
// gives performance improvement
const int NumPasses = Occ < TgtOcc ? 2 : 1;
TgtOcc = std::min(Occ, TgtOcc);
DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = " << TgtOcc << '\n');
GCNMaxOccupancySchedStrategy LStrgy(Context);
for (int I = 0; I < NumPasses; ++I) {
// running first pass with TargetOccupancy = 0 mimics previous scheduling
// approach and is a performance magic
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
for (auto R : Regions) {
OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
Ovr.schedule();
const auto RP = getRegionPressure(*R);
DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
// The saved minreg schedule meets the target - apply it.
DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
// No better schedule available: restore the original order.
DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Minimal Register Strategy

// Reschedule regions with the minreg scheduler, most demanding first.
// Unless 'force' is set, the walk stops once a region's pressure drops below
// the running maximum, or when a minreg schedule would make pressure worse.
void GCNIterativeScheduler::scheduleMinReg(bool force) {
const auto &ST = MF.getSubtarget<SISubtarget>();
const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
for (auto R : Regions) {
if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
break;
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
if (!force && MaxPressure.less(ST, RP, TgtOcc))
break;
scheduleRegion(*R, MinSchedule, RP);
DEBUG(printSchedResult(dbgs(), R, RP));
MaxPressure = RP;
}
}

View File

@@ -0,0 +1,118 @@
//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
// Iterative scheduler: records all scheduling regions during the normal
// MachineScheduler walk and (re)schedules them in finalizeSchedule()
// according to the strategy selected at construction.
class GCNIterativeScheduler : public ScheduleDAGMILive {
typedef ScheduleDAGMILive BaseClass;
public:
// Available scheduling strategies, selected at construction time.
enum StrategyKind {
SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,
SCHEDULE_LEGACYMAXOCCUPANCY
};
GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S);
// Overridden to emit debug info only; the real work is deferred.
void schedule() override;
// Overridden to record the region for later processing.
void enterRegion(MachineBasicBlock *BB,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned RegionInstrs) override;
// Dispatches to the selected strategy once all regions are collected.
void finalizeSchedule() override;
protected:
typedef ArrayRef<const SUnit*> ScheduleRef;
// A schedule detached from the DAG (plain MachineInstr pointers) along
// with the register pressure it was measured to produce.
struct TentativeSchedule {
std::vector<MachineInstr*> Schedule;
GCNRegPressure MaxPressure;
};
struct Region {
// Fields except for BestSchedule are supposed to reflect current IR state
// `const` fields are to emphasize they shouldn't change for any schedule.
MachineBasicBlock::iterator Begin;
// End is either a boundary instruction or end of basic block
const MachineBasicBlock::iterator End;
const unsigned NumRegionInstrs;
GCNRegPressure MaxPressure;
// best schedule for the region so far (not scheduled yet)
std::unique_ptr<TentativeSchedule> BestSchedule;
};
// Regions are bump-allocated and live for the scheduler's lifetime.
SpecificBumpPtrAllocator<Region> Alloc;
std::vector<Region*> Regions;
MachineSchedContext *Context;
const StrategyKind Strategy;
// mutable: getRegionPressure() reuses tracker state across const calls.
mutable GCNUpwardRPTracker UPTracker;
class BuildDAG;
class OverrideLegacyStrategy;
template <typename Range>
GCNRegPressure getSchedulePressure(const Region &R,
Range &&Schedule) const;
GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End) const;
GCNRegPressure getRegionPressure(const Region &R) const {
return getRegionPressure(R.Begin, R.End);
}
void setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
void scheduleBest(Region &R);
std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
void sortRegionsByPressure(unsigned TargetOcc);
template <typename Range>
void scheduleRegion(Region &R, Range &&Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
unsigned tryMaximizeOccupancy(unsigned TargetOcc =
std::numeric_limits<unsigned>::max());
void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);
void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,
const Region *R,
const GCNRegPressure &RP) const;
void printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const;
};
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H

View File

@@ -0,0 +1,266 @@
//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
// Greedy list scheduler that picks instructions so as to keep register
// usage low. Ready candidates live in an intrusive queue; NumPreds tracks
// each node's unscheduled predecessors, with the all-ones value serving as
// the "already scheduled" sentinel.
class GCNMinRegScheduler {
struct Candidate : ilist_node<Candidate> {
const SUnit *SU;
int Priority;
Candidate(const SUnit *SU_, int Priority_ = 0)
: SU(SU_), Priority(Priority_) {}
};
// Candidates are bump-allocated and released all at once.
SpecificBumpPtrAllocator<Candidate> Alloc;
typedef simple_ilist<Candidate> Queue;
Queue RQ; // Ready queue
// Indexed by SUnit::NodeNum; UINT_MAX marks a scheduled node.
std::vector<unsigned> NumPreds;
bool isScheduled(const SUnit *SU) const {
assert(!SU->isBoundaryNode());
return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
}
void setIsScheduled(const SUnit *SU) {
assert(!SU->isBoundaryNode());
NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
}
unsigned getNumPreds(const SUnit *SU) const {
assert(!SU->isBoundaryNode());
assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
return NumPreds[SU->NodeNum];
}
// Returns the decremented count; zero means the node just became ready.
unsigned decNumPreds(const SUnit *SU) {
assert(!SU->isBoundaryNode());
assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
return --NumPreds[SU->NodeNum];
}
void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
int getReadySuccessors(const SUnit *SU) const;
int getNotReadySuccessors(const SUnit *SU) const;
template <typename Calc>
unsigned findMax(unsigned Num, Calc C);
Candidate* pickCandidate();
void bumpPredsPriority(const SUnit *SchedSU, int Priority);
void releaseSuccessors(const SUnit* SU, int Priority);
public:
std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
};
// Seed the per-node counters from the DAG's NumPredsLeft values.
void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
  NumPreds.clear();
  NumPreds.reserve(SUnits.size());
  for (const SUnit &SU : SUnits)
    NumPreds.push_back(SU.NumPredsLeft);
}
// Count SU's successors that would have every predecessor scheduled once SU
// itself is scheduled - i.e. the successors SU's scheduling makes ready.
// (Iterates dependence edges by const reference to avoid the per-iteration
// SDep copies the original range-for made.)
int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
  unsigned NumSchedSuccs = 0;
  for (const auto &SDep : SU->Succs) {
    bool WouldBeScheduled = true;
    for (const auto &PDep : SDep.getSUnit()->Preds) {
      const auto *PSU = PDep.getSUnit();
      assert(!PSU->isBoundaryNode());
      // Any other unscheduled predecessor keeps this successor blocked.
      if (PSU != SU && !isScheduled(PSU)) {
        WouldBeScheduled = false;
        break;
      }
    }
    NumSchedSuccs += WouldBeScheduled ? 1 : 0;
  }
  return NumSchedSuccs;
}
// Successors of SU that would still be blocked by other unscheduled
// predecessors after SU is scheduled.
int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
return SU->Succs.size() - getReadySuccessors(SU);
}
// Rate the first Num candidates of the ready queue with C. Every candidate
// that ties or beats the running maximum is rotated to the front of the
// queue, so after the scan the queue front holds a candidate with the
// maximum rating. Returns how many candidates share that maximum.
template <typename Calc>
unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
assert(!RQ.empty() && Num <= RQ.size());
typedef decltype(C(*RQ.begin())) T;
T Max = std::numeric_limits<T>::min();
unsigned NumMax = 0;
for (auto I = RQ.begin(); Num; --Num) {
T Cur = C(*I);
if (Cur >= Max) {
if (Cur > Max) {
Max = Cur;
NumMax = 1;
} else
++NumMax;
// Advance the iterator before unlinking, then rotate the candidate
// to the queue front.
auto &Cand = *I++;
RQ.remove(Cand);
RQ.push_front(Cand);
continue;
}
++I;
}
return NumMax;
}
// Pick the next candidate via a cascade of tie-breaking metrics:
// 1) highest priority, 2) fewest successors left non-ready, 3) most
// successors made ready, 4) lowest NodeNum (original program order).
// Each findMax round filters the tie set and rotates the winners to the
// queue front, so the final choice is always RQ.front().
GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
do {
unsigned Num = RQ.size();
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
int Res = getNotReadySuccessors(SU);
DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
<< Res << " successors, metric = " << -Res << '\n');
return -Res;
});
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting most producing candidate among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
auto Res = getReadySuccessors(SU);
DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
<< Res << " successors, metric = " << Res << '\n');
return Res;
});
if (Num == 1) break;
// All metrics tied; fall back to program order over the remaining set.
Num = Num ? Num : RQ.size();
DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
assert(Num == 1);
} while (false);
return &RQ.front();
}
// Called when scheduling SchedSU made no successor ready: raise the
// priority of every unscheduled instruction in the predecessor closure of
// SchedSU's non-ready data successors, so the producers still needed to
// unblock them get picked sooner.
void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
SmallPtrSet<const SUnit*, 32> Set;
// Seed with the direct unscheduled predecessors of each blocked data
// successor.
for (const auto &S : SchedSU->Succs) {
if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
S.getKind() != SDep::Data)
continue;
for (const auto &P : S.getSUnit()->Preds) {
auto PSU = P.getSUnit();
assert(!PSU->isBoundaryNode());
if (PSU != SchedSU && !isScheduled(PSU)) {
Set.insert(PSU);
}
}
}
// Expand the seed set to its full unscheduled predecessor closure.
SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
while (!Worklist.empty()) {
auto SU = Worklist.pop_back_val();
assert(!SU->isBoundaryNode());
for (const auto &P : SU->Preds) {
if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
Set.insert(P.getSUnit()).second)
Worklist.push_back(P.getSUnit());
}
}
DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
<< ")'s non-ready successors of " << Priority
<< " priority in ready queue: ");
const auto SetEnd = Set.end();
// Apply the new priority to queued candidates that are in the closure.
for (auto &C : RQ) {
if (Set.find(C.SU) != SetEnd) {
C.Priority = Priority;
DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
}
DEBUG(dbgs() << '\n');
}
// Account for SU being scheduled: decrement the predecessor count of each
// non-weak successor and push the ones that became ready onto the queue
// with the given priority.
void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
for (const auto &S : SU->Succs) {
auto SuccSU = S.getSUnit();
if (S.isWeak())
continue;
assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
}
}
// Run the scheduler: seed the queue with the top roots, then repeatedly
// pick the best ready candidate, schedule it and release the successors it
// unblocks. StepNo stamps the priority of newly released candidates.
std::vector<const SUnit*>
GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG) {
const auto &SUnits = DAG.SUnits;
std::vector<const SUnit*> Schedule;
Schedule.reserve(SUnits.size());
initNumPreds(SUnits);
int StepNo = 0;
for (auto SU : TopRoots) {
RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
}
releaseSuccessors(&DAG.EntrySU, StepNo);
while (!RQ.empty()) {
DEBUG(
dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
"Ready queue:";
for (auto &C : RQ)
dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
dbgs() << '\n';
);
auto C = pickCandidate();
assert(C);
RQ.remove(*C);
auto SU = C->SU;
DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
setIsScheduled(SU);
// If this pick made nothing ready, boost the producers its consumers
// are still waiting for.
if (getReadySuccessors(SU) == 0)
bumpPredsPriority(SU, StepNo);
++StepNo;
}
assert(SUnits.size() == Schedule.size());
return Schedule;
}
namespace llvm {
// Public entry point: run the minimal-register scheduler over DAG starting
// from TopRoots and return the chosen instruction order.
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
                                             const ScheduleDAG &DAG) {
  GCNMinRegScheduler Scheduler;
  return Scheduler.schedule(TopRoots, DAG);
}
} // end namespace llvm

View File

@@ -0,0 +1,355 @@
//===------------------------- GCNRegPressure.cpp - -----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump all virtual registers (and their live subranges, when present) that
// are live at slot index SI, preceded by the instruction at that index.
void llvm::printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
dbgs() << "Live regs at " << SI << ": "
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
bool firstTime = true;
for (const auto &S : LI.subranges()) {
if (!S.liveAt(SI)) continue;
if (firstTime) {
// Print the register header once before its live subranges.
dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
<< '\n';
firstTime = false;
}
dbgs() << " " << S << '\n';
++Num;
}
} else if (LI.liveAt(SI)) {
dbgs() << " " << LI << '\n';
++Num;
}
}
if (!Num) dbgs() << " <none>\n";
}
// Structural comparison of two live-register maps: equal iff they contain
// the same registers with identical lane masks.
static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
                    const GCNRPTracker::LiveRegSet &S2) {
  if (S1.size() != S2.size())
    return false;
  for (const auto &Entry : S1) {
    auto Match = S2.find(Entry.first);
    if (Match == S2.end())
      return false;
    if (Match->second != Entry.second)
      return false;
  }
  return true;
}
// Return a copy of LR with all zero-lane-mask entries removed.
static GCNRPTracker::LiveRegSet
stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
  GCNRPTracker::LiveRegSet Stripped;
  for (const auto &Entry : LR)
    if (Entry.second.any())
      Stripped.insert(Entry);
  return Stripped;
}
#endif
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure

// Classify a virtual register as a single 32-bit SGPR/VGPR or an SGPR/VGPR
// tuple, based on its register class.
unsigned GCNRegPressure::getRegKind(unsigned Reg,
                                    const MachineRegisterInfo &MRI) {
  assert(TargetRegisterInfo::isVirtualRegister(Reg));
  const auto *RC = MRI.getRegClass(Reg);
  const auto *STI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // A 4-byte class is a single 32-bit register; anything wider is a tuple.
  const bool Single32 = RC->getSize() == 4;
  if (STI->isSGPRClass(RC))
    return Single32 ? SGPR32 : SGPR_TUPLE;
  return Single32 ? VGPR32 : VGPR_TUPLE;
}
// Account for a lane-mask transition of virtual register Reg from PrevMask
// to NewMask. The masks are normalized so NewMask is the larger one; Sign
// records whether pressure grows (+1) or shrinks (-1).
void GCNRegPressure::inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
const MachineRegisterInfo &MRI) {
if (NewMask == PrevMask)
return;
int Sign = 1;
if (NewMask < PrevMask) {
std::swap(NewMask, PrevMask);
Sign = -1;
}
#ifndef NDEBUG
const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
#endif
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
// A single 32-bit register is either fully live or fully dead.
assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
// Each newly covered lane contributes one 32-bit register...
Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
// ...and the tuple weight is accounted once, when the register first
// becomes live (PrevMask was empty).
if (PrevMask.none()) {
assert(NewMask.any());
Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
}
break;
default: llvm_unreachable("Unknown register kind");
}
}
// Returns true if this pressure is lower (i.e. better) than O's, with
// occupancy capped at MaxOccupancy. Achievable occupancy dominates; ties
// are broken by tuple weights of the more critical register kind, then by
// plain register counts.
bool GCNRegPressure::less(const SISubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGRPNum()));
const auto VGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(getVGRPNum()));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
const auto OtherVGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
// Higher achievable occupancy means lower effective pressure.
if (Occ != OtherOcc)
return Occ > OtherOcc;
// A register kind is "important" when it is the one limiting occupancy.
bool SGPRImportant = SGPROcc < VGPROcc;
const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
// if both pressures disagree on what is more important compare vgprs
if (SGPRImportant != OtherSGPRImportant) {
SGPRImportant = false;
}
// compare large regs pressure
bool SGPRFirst = SGPRImportant;
// Two rounds: the important kind first, then the other one.
for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
if (SGPRFirst) {
auto SW = getSGPRTuplesWeight();
auto OtherSW = O.getSGPRTuplesWeight();
if (SW != OtherSW)
return SW < OtherSW;
} else {
auto VW = getVGPRTuplesWeight();
auto OtherVW = O.getVGPRTuplesWeight();
if (VW != OtherVW)
return VW < OtherVW;
}
}
return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
(getVGRPNum() < O.getVGRPNum());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump the pressure record; with a subtarget, annotate each register
// count with the occupancy it allows, and append the combined occupancy.
void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
  OS << "VGPRs: " << getVGRPNum();
  if (ST)
    OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
  OS << ", SGPRs: " << getSGRPNum();
  if (ST)
    OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
  OS << ", LVGPR WT: " << getVGPRTuplesWeight()
     << ", LSGPR WT: " << getSGPRTuplesWeight();
  if (ST)
    OS << " -> Occ: " << getOccupancy(*ST);
  OS << '\n';
}
#endif
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
// Compute the mask of Reg's lanes that are live at slot index SI
// according to LIS.
LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
                                  SlotIndex SI,
                                  const LiveIntervals &LIS,
                                  const MachineRegisterInfo &MRI) {
  assert(!MRI.reg_nodbg_empty(Reg));
  const auto &LI = LIS.getInterval(Reg);
  LaneBitmask Mask;

  // Without subranges the whole register is live or dead as a unit.
  if (!LI.hasSubRanges()) {
    if (LI.liveAt(SI))
      Mask = MRI.getMaxLaneMaskForVReg(Reg);
    return Mask;
  }

  // Accumulate the lanes of every subrange live at SI.
  for (const auto &SR : LI.subranges()) {
    if (!SR.liveAt(SI))
      continue;
    Mask |= SR.LaneMask;
    assert(Mask < MRI.getMaxLaneMaskForVReg(Reg) ||
           Mask == MRI.getMaxLaneMaskForVReg(Reg));
  }
  return Mask;
}
// Build the set of virtual registers (with their live lane masks) that
// are live at slot index SI.
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
                                           const LiveIntervals &LIS,
                                           const MachineRegisterInfo &MRI) {
  GCNRPTracker::LiveRegSet Live;
  const unsigned NumVRegs = MRI.getNumVirtRegs();
  for (unsigned Idx = 0; Idx != NumVRegs; ++Idx) {
    const auto Reg = TargetRegisterInfo::index2VirtReg(Idx);
    // Skip registers with no non-debug uses or defs.
    if (MRI.reg_nodbg_empty(Reg))
      continue;
    const auto Mask = getLiveLaneMask(Reg, SI, LIS, MRI);
    if (Mask.any())
      Live[Reg] = Mask;
  }
  return Live;
}
// Reset the tracker to the program point just below (after) MI: the live
// set is recomputed from LIS, and both current and maximum pressure are
// re-derived from it.
void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
  MRI = &MI.getParent()->getParent()->getRegInfo();
  LiveRegs = getLiveRegsAfter(MI, LIS);
  MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
// Mask of lanes written by the def operand MO: either the whole register
// or just the lanes of the written subregister.
LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
  assert(MO.isDef() && MO.isReg() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()));

  // We don't rely on read-undef flag because in case of tentative schedule
  // tracking it isn't set correctly yet. This works correctly however since
  // use mask has been tracked before using LIS.
  if (MO.getSubReg() == 0)
    return MRI->getMaxLaneMaskForVReg(MO.getReg());
  return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
// Mask of lanes read by the use operand MO.
LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
  assert(MO.isUse() && MO.isReg() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()));

  // An explicit subregister use reads exactly that subregister's lanes.
  if (auto SubReg = MO.getSubReg())
    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);

  auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
  if (MaxMask.getAsInteger() == 1) // cannot have subregs
    return MaxMask;

  // For a tentative schedule LIS isn't updated yet but livemask should remain
  // the same on any schedule. Subreg defs can be reordered but they all must
  // dominate uses anyway.
  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
  return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
}
// Move the tracker one instruction upward: from the state just below MI
// to the state just above it. Defs kill lanes, uses make lanes live.
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
  assert(MRI && "call reset first");

  LastTrackedMI = &MI;

  if (MI.isDebugValue())
    return;

  // process all defs first to ensure early clobbers are handled correctly
  // iterating over operands() to catch implicit defs
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.isDef() ||
        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      continue;
    auto Reg = MO.getReg();
    auto &LiveMask = LiveRegs[Reg];
    auto PrevMask = LiveMask;
    // Moving upward, defined lanes stop being live above the def.
    LiveMask &= ~getDefRegMask(MO);
    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
  }

  // then all uses
  for (const auto &MO : MI.uses()) {
    if (!MO.isReg() || !MO.readsReg() ||
        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      continue;
    auto Reg = MO.getReg();
    auto &LiveMask = LiveRegs[Reg];
    auto PrevMask = LiveMask;
    // Lanes read by MI must be live above it.
    LiveMask |= getUsedRegMask(MO);
    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
  }

  MaxPressure = max(MaxPressure, CurPressure);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Print the differences between the LIS-reported live set and the
// tracker's live set: entries missing on either side, and entries whose
// lane masks disagree.
static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
                           const GCNRPTracker::LiveRegSet &TrackedLR,
                           const TargetRegisterInfo *TRI) {
  // Entries the tracker has but LIS lacks or disagrees with.
  for (const auto &Entry : TrackedLR) {
    const auto It = LISLR.find(Entry.first);
    if (It == LISLR.end()) {
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << ":L" << PrintLaneMask(Entry.second)
             << " isn't found in LIS reported set\n";
    } else if (It->second != Entry.second) {
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << " masks doesn't match: LIS reported "
             << PrintLaneMask(It->second)
             << ", tracked "
             << PrintLaneMask(Entry.second)
             << '\n';
    }
  }
  // Entries LIS reports live that the tracker missed entirely.
  for (const auto &Entry : LISLR) {
    if (TrackedLR.find(Entry.first) == TrackedLR.end())
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << ":L" << PrintLaneMask(Entry.second)
             << " isn't found in tracked set\n";
  }
}
// Debug-only consistency check: compare the tracker state after the last
// recede() against what LIS reports for the same slot. Returns false
// (after dumping diagnostics) on any mismatch.
bool GCNUpwardRPTracker::isValid() const {
  const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
  const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
  // The tracker may keep zero-mask entries around; drop them before
  // comparing against the LIS-derived set.
  const auto TrackedLR = stripEmpty(LiveRegs);

  if (!isEqual(LISLR, TrackedLR)) {
    dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
              " LIS reported livesets mismatch:\n";
    printLivesAt(SI, LIS, *MRI);
    reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
    return false;
  }

  // The live sets agree; the pressures derived from them must agree too.
  auto LISPressure = getRegPressure(*MRI, LISLR);
  if (LISPressure != CurPressure) {
    dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
    CurPressure.print(dbgs());
    dbgs() << "LIS rpt: ";
    LISPressure.print(dbgs());
    return false;
  }
  return true;
}
#endif

View File

@ -0,0 +1,170 @@
//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#include "AMDGPUSubtarget.h"
#include <limits>
namespace llvm {
/// Register pressure record for GCN targets, kept as four counters
/// (see RegKind). NOTE(review): the getter names spell "SGRP"/"VGRP"
/// (a historical typo) — callers depend on these names, so they are kept.
struct GCNRegPressure {
  enum RegKind {
    SGPR32,      // number of live 32-bit SGPR lanes
    SGPR_TUPLE,  // accumulated pressure-set weight of live SGPR tuples
    VGPR32,      // number of live 32-bit VGPR lanes
    VGPR_TUPLE,  // accumulated pressure-set weight of live VGPR tuples
    TOTAL_KINDS
  };

  GCNRegPressure() {
    clear();
  }

  bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }

  void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }

  unsigned getSGRPNum() const { return Value[SGPR32]; }
  unsigned getVGRPNum() const { return Value[VGPR32]; }

  unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
  unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }

  // Occupancy allowed by this pressure: the minimum of what the SGPR and
  // VGPR counts each permit on the given subtarget.
  unsigned getOccupancy(const SISubtarget &ST) const {
    return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
                    ST.getOccupancyWithNumVGPRs(getVGRPNum()));
  }

  // Account for a change of Reg's live lane mask from PrevMask to NewMask.
  void inc(unsigned Reg,
           LaneBitmask PrevMask,
           LaneBitmask NewMask,
           const MachineRegisterInfo &MRI);

  bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
    return getOccupancy(ST) > O.getOccupancy(ST);
  }

  // True if this pressure is preferable to O; occupancy is compared
  // first, capped at MaxOccupancy (see the .cpp for the tie-breaks).
  bool less(const SISubtarget &ST, const GCNRegPressure& O,
            unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;

  bool operator==(const GCNRegPressure &O) const {
    return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
  }

  bool operator!=(const GCNRegPressure &O) const {
    return !(*this == O);
  }

  void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
  void dump() const { print(dbgs()); }

private:
  unsigned Value[TOTAL_KINDS];

  static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);

  friend GCNRegPressure max(const GCNRegPressure &P1,
                            const GCNRegPressure &P2);
};
// Component-wise maximum of two pressure records.
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
  GCNRegPressure Res;
  for (unsigned Kind = 0; Kind != GCNRegPressure::TOTAL_KINDS; ++Kind)
    Res.Value[Kind] = std::max(P1.Value[Kind], P2.Value[Kind]);
  return Res;
}
/// Common state for GCN register-pressure trackers: the current live set,
/// the current and maximum observed pressure, and the last instruction
/// processed. Concrete trackers (e.g. GCNUpwardRPTracker) update it.
class GCNRPTracker {
public:
  typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;

protected:
  LiveRegSet LiveRegs;  // live virtual registers -> live lane masks
  GCNRegPressure CurPressure, MaxPressure;
  const MachineInstr *LastTrackedMI = nullptr;
  // mutable: set lazily from the tracked function by derived trackers.
  mutable const MachineRegisterInfo *MRI = nullptr;
  GCNRPTracker() {}
public:
  // live regs for the current state
  const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
  const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }

  // returns MaxPressure, resetting it
  decltype(MaxPressure) moveMaxPressure() {
    auto Res = MaxPressure;
    MaxPressure.clear();
    return Res;
  }

  // Transfers ownership of the live set to the caller; the tracker's set
  // is left in a moved-from state.
  decltype(LiveRegs) moveLiveRegs() {
    return std::move(LiveRegs);
  }
};
/// Tracks register pressure bottom-up: reset() below an instruction, then
/// recede() over each instruction moving toward the region start.
class GCNUpwardRPTracker : public GCNRPTracker {
  const LiveIntervals &LIS;

  // Mask of lanes written by a def operand.
  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
  // Mask of lanes read by a use operand.
  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
  GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
  // reset tracker to the point just below MI
  // filling live regs upon this point using LIS
  void reset(const MachineInstr &MI);
  // move to the state just above the MI
  void recede(const MachineInstr &MI);
  // checks whether the tracker's state after receding MI corresponds
  // to reported by LIS
  bool isValid() const;
};
LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
// Live set at the point just after MI (its dead slot).
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
                                                 const LiveIntervals &LIS) {
  const auto &MRI = MI.getParent()->getParent()->getRegInfo();
  return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, MRI);
}
// Live set at the point just before MI (its base slot).
inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
                                                  const LiveIntervals &LIS) {
  const auto &MRI = MI.getParent()->getParent()->getRegInfo();
  return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, MRI);
}
// Derive a pressure record from a range of (register, lane mask) pairs,
// treating every entry as becoming live from nothing.
template <typename Range>
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
                              Range &&LiveRegs) {
  GCNRegPressure Pressure;
  for (const auto &RegMask : LiveRegs)
    Pressure.inc(RegMask.first, LaneBitmask::getNone(), RegMask.second, MRI);
  return Pressure;
}
void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H

View File

@ -45,8 +45,6 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
if (MF != &DAG->MF)
TargetOccupancy = 0;
MF = &DAG->MF;
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
@ -531,7 +529,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Stage++;
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
S.TargetOccupancy = MinOccupancy;
S.setTargetOccupancy(MinOccupancy);
MachineBasicBlock *MBB = nullptr;
for (auto Region : Regions) {

View File

@ -55,6 +55,8 @@ public:
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
class GCNScheduleDAGMILive : public ScheduleDAGMILive {

View File

@ -1,4 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
; We expect a two digit VGPR usage here, not a three digit.
; CHECK: NumVgprs: {{[0-9][0-9]$}}

View File

@ -0,0 +1,288 @@
; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; SI: NumSgprs: {{[1-9]$}}
; SI: NumVgprs: {{[1-9]$}}
; stores may alias loads
; VI: NumSgprs: {{[1-5][0-9]$}}
; VI: NumVgprs: {{[1-3][0-9]$}}
; Loads 30 (a, b, c) triples from LDS, computes fma(a, b, c) for each, and
; stores the 30 results to global memory. All 90 loads precede all stores,
; giving the scheduler a large window where every loaded value is live —
; a stress test for register-pressure-aware scheduling.
define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
bb:
  ; LDS addresses of the 30 operand triples. Triples are 768 elements
  ; apart; within a triple b = a + 248 and c = b + 256.
  %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
  %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
  %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
  %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
  %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
  %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
  %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
  %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
  %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
  %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
  %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
  %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
  %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
  %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
  %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
  %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
  %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
  %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
  %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
  %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
  %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
  %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
  %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
  %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
  %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
  %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
  %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
  %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
  %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
  %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
  %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
  %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
  %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
  %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
  %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
  %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
  %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
  %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
  %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
  %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
  %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
  %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
  %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
  %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
  %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
  %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
  %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
  %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
  %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
  %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
  %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
  %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
  %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
  %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
  %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
  %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
  %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
  %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
  %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
  %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
  %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
  %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
  %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
  %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
  %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
  %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
  %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
  %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
  %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
  %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
  %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
  %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
  %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
  %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
  %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
  %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
  %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
  %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
  %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
  %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
  %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
  %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
  %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
  %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
  %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
  %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
  %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
  %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
  %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
  %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
  ; Load all 90 operands before any arithmetic or store.
  %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
  %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
  %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
  %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
  %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
  %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
  %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
  %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
  %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
  %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
  %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
  %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
  %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
  %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
  %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
  %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
  %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
  %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
  %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
  %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
  %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
  %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
  %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
  %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
  %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
  %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
  %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
  %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
  %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
  %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
  %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
  %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
  %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
  %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
  %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
  %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
  %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
  %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
  %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
  %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
  %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
  %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
  %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
  %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
  %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
  %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
  %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
  %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
  %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
  %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
  %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
  %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
  %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
  %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
  %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
  %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
  %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
  %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
  %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
  %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
  %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
  %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
  %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
  %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
  %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
  %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
  %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
  %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
  %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
  %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
  %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
  %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
  %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
  %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
  %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
  %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
  %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
  %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
  %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
  %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
  %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
  %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
  %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
  %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
  %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
  %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
  %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
  %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
  %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
  %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
  ; res.k = a.k * b.k + c.k
  %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
  %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
  %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
  %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3)
  %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4)
  %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5)
  %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6)
  %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7)
  %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8)
  %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9)
  %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10)
  %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11)
  %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12)
  %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13)
  %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14)
  %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15)
  %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16)
  %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17)
  %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18)
  %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19)
  %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20)
  %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21)
  %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22)
  %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23)
  %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24)
  %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25)
  %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26)
  %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
  %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
  %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
  ; Global output addresses: result k goes to element 2*k.
  %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
  %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
  %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
  %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
  %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
  %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
  %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
  %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
  %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
  %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
  %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
  %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
  %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
  %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
  %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
  %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
  %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
  %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
  %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
  %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
  %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
  %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
  %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
  %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
  %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
  %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
  %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
  %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
  %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
  %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
  ; Stores come last (and may alias the loads, constraining the scheduler).
  store float %res.0, float addrspace(1)* %adr.res.0, align 4
  store float %res.1, float addrspace(1)* %adr.res.1, align 4
  store float %res.2, float addrspace(1)* %adr.res.2, align 4
  store float %res.3, float addrspace(1)* %adr.res.3, align 4
  store float %res.4, float addrspace(1)* %adr.res.4, align 4
  store float %res.5, float addrspace(1)* %adr.res.5, align 4
  store float %res.6, float addrspace(1)* %adr.res.6, align 4
  store float %res.7, float addrspace(1)* %adr.res.7, align 4
  store float %res.8, float addrspace(1)* %adr.res.8, align 4
  store float %res.9, float addrspace(1)* %adr.res.9, align 4
  store float %res.10, float addrspace(1)* %adr.res.10, align 4
  store float %res.11, float addrspace(1)* %adr.res.11, align 4
  store float %res.12, float addrspace(1)* %adr.res.12, align 4
  store float %res.13, float addrspace(1)* %adr.res.13, align 4
  store float %res.14, float addrspace(1)* %adr.res.14, align 4
  store float %res.15, float addrspace(1)* %adr.res.15, align 4
  store float %res.16, float addrspace(1)* %adr.res.16, align 4
  store float %res.17, float addrspace(1)* %adr.res.17, align 4
  store float %res.18, float addrspace(1)* %adr.res.18, align 4
  store float %res.19, float addrspace(1)* %adr.res.19, align 4
  store float %res.20, float addrspace(1)* %adr.res.20, align 4
  store float %res.21, float addrspace(1)* %adr.res.21, align 4
  store float %res.22, float addrspace(1)* %adr.res.22, align 4
  store float %res.23, float addrspace(1)* %adr.res.23, align 4
  store float %res.24, float addrspace(1)* %adr.res.24, align 4
  store float %res.25, float addrspace(1)* %adr.res.25, align 4
  store float %res.26, float addrspace(1)* %adr.res.26, align 4
  store float %res.27, float addrspace(1)* %adr.res.27, align 4
  store float %res.28, float addrspace(1)* %adr.res.28, align 4
  store float %res.29, float addrspace(1)* %adr.res.29, align 4
  ret void
}
declare float @llvm.fmuladd.f32(float, float, float) #0
attributes #0 = { nounwind readnone }