[AMDGPU] Avoid second rescheduling for some regions

If a region was not constrained by high register pressure
and was not rescheduled without clustering, we can skip
rescheduling it in the ClusteredLowOccupancyReschedule stage.

This improves scheduling speed by 25% on some kernels.

Differential Revision: https://reviews.llvm.org/D97506
Stanislav Mekhanoshin 2021-02-25 15:03:34 -08:00
parent 635993f07b
commit 799c50fe93
2 changed files with 36 additions and 8 deletions
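
In effect, the patch lets finalizeSchedule() skip a region during the
ClusteredLowOccupancyReschedule stage unless that region either formed
load/store clusters or ran into excess register pressure in an earlier
stage. Below is a minimal standalone sketch of the new skip test (the
stage and flag names come from the patch; std::vector<bool> stands in
for llvm::BitVector, and canSkipRegion is an illustrative helper, not
an API in the patch):

  #include <vector>

  // Scheduling stages named in the patch (simplified).
  enum Stage { InitialSchedule, UnclusteredReschedule,
               ClusteredLowOccupancyReschedule };

  // True if region RegionIdx may be skipped at stage S.
  bool canSkipRegion(Stage S, unsigned RegionIdx,
                     const std::vector<bool> &RescheduleRegions,
                     const std::vector<bool> &RegionsWithClusters,
                     const std::vector<bool> &RegionsWithHighRP) {
    // The unclustered pass already revisits only regions flagged for it.
    if (S == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
      return true;
    // New in this change: the clustered low-occupancy pass skips regions
    // that never had clustered memory ops and never hit excess pressure.
    if (S == ClusteredLowOccupancyReschedule &&
        !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])
      return true;
    return false;
  }

The two bit vectors are filled during the initial schedule: RegionsWithClusters
from the strategy's HasClusteredNodes flag and RegionsWithHighRP from its
HasExcessPressure flag, both reported after each region is scheduled.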

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

@@ -21,7 +21,7 @@ using namespace llvm;
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C) :
     GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
-    MF(nullptr) { }
+    HasExcessPressure(false), MF(nullptr) { }
 
 void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -104,11 +104,13 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // marked as RegExcess in tryCandidate() when they are compared with
   // instructions that increase the register pressure.
   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+    HasExcessPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
   }
 
   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+    HasExcessPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
   }
@@ -122,6 +124,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
 
   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+    HasExcessPressure = true;
     if (SGPRDelta > VGPRDelta) {
       Cand.RPDelta.CriticalMax =
         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -331,12 +334,17 @@ void GCNScheduleDAGMILive::schedule() {
   }
 
   GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
-  // Set HasClusteredNodes to true for late stages where we are not interested
-  // in it anymore. That way pickNode() will not scan SDep's when not needed.
-  S.HasClusteredNodes = Stage >= UnclusteredReschedule;
+  // Set HasClusteredNodes to true for late stages where we have already
+  // collected it. That way pickNode() will not scan SDep's when not needed.
+  S.HasClusteredNodes = Stage > InitialSchedule;
+  S.HasExcessPressure = false;
   ScheduleDAGMILive::schedule();
   Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
   RescheduleRegions[RegionIdx] = false;
+  if (Stage == InitialSchedule && S.HasClusteredNodes)
+    RegionsWithClusters[RegionIdx] = true;
+  if (S.HasExcessPressure)
+    RegionsWithHighRP[RegionIdx] = true;
 
   if (!LIS)
     return;
@@ -381,8 +389,10 @@ void GCNScheduleDAGMILive::schedule() {
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
   if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
       PressureAfter.getAGPRNum() > MaxVGPRs ||
-      PressureAfter.getSGPRNum() > MaxSGPRs)
+      PressureAfter.getSGPRNum() > MaxSGPRs) {
     RescheduleRegions[RegionIdx] = true;
+    RegionsWithHighRP[RegionIdx] = true;
+  }
 
   if (WavesAfter >= MinOccupancy) {
     if (Stage == UnclusteredReschedule &&
@@ -392,7 +402,8 @@ void GCNScheduleDAGMILive::schedule() {
         PressureAfter.less(ST, PressureBefore) ||
         !RescheduleRegions[RegionIdx]) {
       Pressure[RegionIdx] = PressureAfter;
-      if (!S.HasClusteredNodes && (Stage + 1) == UnclusteredReschedule)
+      if (!RegionsWithClusters[RegionIdx] &&
+          (Stage + 1) == UnclusteredReschedule)
         RescheduleRegions[RegionIdx] = false;
       return;
     } else {
@@ -401,7 +412,7 @@ void GCNScheduleDAGMILive::schedule() {
   }
 
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
-  RescheduleRegions[RegionIdx] = S.HasClusteredNodes ||
+  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
                                  (Stage + 1) != UnclusteredReschedule;
   RegionEnd = RegionBegin;
   for (MachineInstr *MI : Unsched) {
@@ -535,7 +546,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
   RescheduleRegions.resize(Regions.size());
+  RegionsWithClusters.resize(Regions.size());
+  RegionsWithHighRP.resize(Regions.size());
   RescheduleRegions.set();
+  RegionsWithClusters.reset();
+  RegionsWithHighRP.reset();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
@@ -580,7 +595,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
     SavedMutations.swap(Mutations);
 
   for (auto Region : Regions) {
-    if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+    if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
+        (Stage == ClusteredLowOccupancyReschedule &&
+         !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
       ++RegionIdx;
       continue;
     }

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

@@ -54,6 +54,10 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
   // before a region scheduling to know if the region had such clusters.
   bool HasClusteredNodes;
 
+  // schedule() has seen excess register pressure and had to track
+  // register pressure for actual scheduling heuristics.
+  bool HasExcessPressure;
+
   MachineFunction *MF;
 
 public:
@@ -100,6 +104,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // or we generally desire to reschedule it.
   BitVector RescheduleRegions;
 
+  // Record regions which use clustered loads/stores.
+  BitVector RegionsWithClusters;
+
+  // Record regions with high register pressure.
+  BitVector RegionsWithHighRP;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;