[AMDGPU] Attempt to reschedule without clustering

We want more load/store clustering, but we also want to maintain low
register pressure, and these are opposing goals. Allow the scheduler to
reschedule regions without the clustering mutations applied if we hit a
register limit.

Differential Revision: https://reviews.llvm.org/D73386
Stanislav Mekhanoshin, 2020-01-23 16:18:16 -08:00
commit 53eb0f8c07, parent 97711228fd
3 changed files with 99 additions and 18 deletions
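In outline: the initial pass schedules each region with the usual load/store clustering mutations and flags any region whose pressure exceeds the VGPR/SGPR budget; a later pass re-runs just the flagged regions with the mutation list swapped out, keeping the unclustered schedule only when it lowers pressure. Below is a minimal standalone sketch of that control flow. It collapses the patch's four stages into two passes for brevity, and every name in it (Mutation, Region, scheduleRegion, scheduleFunction) is an illustrative stand-in, not the LLVM API:

#include <memory>
#include <vector>

// Illustrative stand-ins only; none of these are the LLVM types.
struct Mutation { virtual ~Mutation() = default; };

struct Region {
  unsigned Pressure = 0;        // pressure of the currently kept schedule
  bool NeedsReschedule = false; // over the register budget after pass 1
};

// Placeholder for the real per-region scheduler: build the DAG, apply the
// active mutations, list-schedule, and return the resulting pressure.
unsigned scheduleRegion(Region &R,
                        const std::vector<std::unique_ptr<Mutation>> &Muts) {
  return R.Pressure; // no-op stand-in
}

void scheduleFunction(std::vector<Region> &Regions,
                      std::vector<std::unique_ptr<Mutation>> &Mutations,
                      unsigned RegLimit) {
  // Pass 1: normal scheduling with clustering; flag over-budget regions.
  for (Region &R : Regions) {
    R.Pressure = scheduleRegion(R, Mutations);
    R.NeedsReschedule = R.Pressure > RegLimit;
  }

  // Pass 2: retry only the flagged regions with the mutations moved aside,
  // so no cluster edges get added, then restore the mutations.
  std::vector<std::unique_ptr<Mutation>> SavedMutations;
  SavedMutations.swap(Mutations); // Mutations is now empty
  for (Region &R : Regions) {
    if (!R.NeedsReschedule)
      continue;
    unsigned NewPressure = scheduleRegion(R, Mutations);
    if (NewPressure < R.Pressure)
      R.Pressure = NewPressure; // unclustered schedule wins
    // otherwise the clustered schedule is kept (revert path omitted)
  }
  SavedMutations.swap(Mutations);
}

The trick is the same one the patch uses: moving the mutation vector aside is enough to disable clustering, because cluster edges are only ever added by the mutations.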

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

@@ -316,13 +316,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
   ST(MF.getSubtarget<GCNSubtarget>()),
   MFI(*MF.getInfo<SIMachineFunctionInfo>()),
   StartingOccupancy(MFI.getOccupancy()),
-  MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+  MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == 0) {
+  if (Stage == Collect) {
     // Just record regions at the first pass.
     Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
     return;
@@ -348,6 +348,7 @@ void GCNScheduleDAGMILive::schedule() {
 
   ScheduleDAGMILive::schedule();
   Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  RescheduleRegions[RegionIdx] = false;
 
   if (!LIS)
     return;
@@ -389,20 +390,28 @@ void GCNScheduleDAGMILive::schedule() {
                       << MinOccupancy << ".\n");
   }
 
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs)
+    RescheduleRegions[RegionIdx] = true;
+
   if (WavesAfter >= MinOccupancy) {
-    unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
-    unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
-    if (WavesAfter > MFI.getMinWavesPerEU() ||
+    if (Stage == UnclusteredReschedule &&
+        !PressureAfter.less(ST, PressureBefore)) {
+      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
         PressureAfter.less(ST, PressureBefore) ||
-        (TotalVGPRs >= PressureAfter.getVGPRNum() &&
-         TotalSGPRs >= PressureAfter.getSGPRNum())) {
+        !RescheduleRegions[RegionIdx]) {
       Pressure[RegionIdx] = PressureAfter;
       return;
+    } else {
+      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     }
-    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
   }
 
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RescheduleRegions[RegionIdx] = true;
   RegionEnd = RegionBegin;
   for (MachineInstr *MI : Unsched) {
     if (MI->isDebugInstr())
@@ -532,33 +541,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
+  RescheduleRegions.resize(Regions.size());
+  RescheduleRegions.set();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
   do {
     Stage++;
     RegionIdx = 0;
     MachineBasicBlock *MBB = nullptr;
 
-    if (Stage > 1) {
+    if (Stage > InitialSchedule) {
+      if (!LIS)
+        break;
+
       // Retry function scheduling if we found resulting occupancy and it is
       // lower than used for first pass scheduling. This will give more freedom
       // to schedule low register pressure blocks.
       // Code is partially copied from MachineSchedulerBase::scheduleRegions().
-      if (!LIS || StartingOccupancy <= MinOccupancy)
-        break;
+      if (Stage == UnclusteredReschedule) {
+        if (RescheduleRegions.none())
+          continue;
+        LLVM_DEBUG(dbgs() <<
+          "Retrying function scheduling without clustering.\n");
+      }
 
-      LLVM_DEBUG(
-          dbgs()
-          << "Retrying function scheduling with lowest recorded occupancy "
-          << MinOccupancy << ".\n");
+      if (Stage == ClusteredLowOccupancyReschedule) {
+        if (StartingOccupancy <= MinOccupancy)
+          break;
-      S.setTargetOccupancy(MinOccupancy);
+        LLVM_DEBUG(
+            dbgs()
+            << "Retrying function scheduling with lowest recorded occupancy "
+            << MinOccupancy << ".\n");
+        S.setTargetOccupancy(MinOccupancy);
+      }
     }
 
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+
     for (auto Region : Regions) {
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+        continue;
+
       RegionBegin = Region.first;
       RegionEnd = Region.second;
@@ -566,7 +597,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 
       if (MBB) finishBlock();
       MBB = RegionBegin->getParent();
       startBlock(MBB);
-      if (Stage == 1)
+      if (Stage == InitialSchedule)
         computeBlockPressure(MBB);
     }
@@ -594,5 +625,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
     }
     finishBlock();
 
-  } while (Stage < 2);
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+  } while (Stage != LastStage);
 }
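Condensed, the per-region accept/revert decision that schedule() now makes reads roughly as follows. This is a gloss of the diff above, not a drop-in: the boolean parameters stand in for the GCNRegPressure comparisons, and keepSchedule itself is a hypothetical helper.

// Keep the region's new schedule unless it costs occupancy, the unclustered
// retry failed to lower pressure, or it would push registers into spilling.
bool keepSchedule(bool UnclusteredStage, unsigned WavesAfter,
                  unsigned MinOccupancy, unsigned MinWavesPerEU,
                  bool PressureImproved, bool OverRegisterLimit) {
  if (WavesAfter < MinOccupancy)
    return false; // occupancy regressed: revert and reschedule later
  if (UnclusteredStage && !PressureImproved)
    return false; // "Unclustered reschedule did not help."
  return WavesAfter > MinWavesPerEU || PressureImproved || !OverRegisterLimit;
}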

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

@@ -64,6 +64,14 @@ public:
 
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
+  enum : unsigned {
+    Collect,
+    InitialSchedule,
+    UnclusteredReschedule,
+    ClusteredLowOccupancyReschedule,
+    LastStage = ClusteredLowOccupancyReschedule
+  };
+
   const GCNSubtarget &ST;
 
   SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
+  // Records if a region is not yet scheduled, or schedule has been reverted,
+  // or we generally desire to reschedule it.
+  BitVector RescheduleRegions;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
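Taken together with the .cpp changes, the enum names four passes: Collect only records regions, InitialSchedule is the normal clustered pass, UnclusteredReschedule re-runs flagged regions with the mutations swapped out, and ClusteredLowOccupancyReschedule retries at the lowest occupancy recorded so far. A compilable gloss of how the enum drives the do/while in finalizeSchedule() follows; runPass is a hypothetical stand-in for the per-region scheduling loop, and the fixed-size bitset stands in for the BitVector:

#include <bitset>

enum : unsigned {
  Collect,
  InitialSchedule,
  UnclusteredReschedule,
  ClusteredLowOccupancyReschedule,
  LastStage = ClusteredLowOccupancyReschedule
};

void runStages(const std::bitset<32> &RescheduleRegions,
               unsigned StartingOccupancy, unsigned MinOccupancy,
               void (*runPass)(unsigned Stage)) {
  unsigned Stage = Collect; // regions were recorded during the initial walk
  do {
    ++Stage;
    if (Stage == UnclusteredReschedule && RescheduleRegions.none())
      continue; // nothing exceeded the register limit; skip this pass
    if (Stage == ClusteredLowOccupancyReschedule &&
        StartingOccupancy <= MinOccupancy)
      break; // occupancy never dropped; nothing left to retry
    runPass(Stage);
  } while (Stage != LastStage);
}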

llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll

@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Interleave loads and stores to fit into the 9 VGPR limit.
+; This requires avoiding load/store clustering.
+
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: NumVgprs: {{[0-9]$}}
+; GCN: ScratchSize: 0{{$}}
+
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
+  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
+  store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
+  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
+  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-num-vgpr"="9" }
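For reference, the checks above run against the gfx900 assembly produced by the RUN line; after a build, the single test can be exercised with the standard lit driver (the test path here is the one assumed above):

llvm-lit -v llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll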