MachineScheduler: Refactor setPolicy() to limit computing remaining latency

Summary:
Computing the remaining latency can be very expensive, especially
on graphs of N nodes where the number of edges approaches N^2.

This reduces the compile time of a pathological case with the
AMDGPU backend from ~7.5 seconds to ~3 seconds.  This test case has
a basic block with 2655 stores, each with somewhere between 500
and 1500 successors and predecessors.

Reviewers: atrick, MatzeB, airlied, mareko

Reviewed By: mareko

Subscribers: tpr, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D50486

llvm-svn: 340346
This commit is contained in:
Tom Stellard 2018-08-21 21:48:43 +00:00
parent 6a2a5c99c7
commit ecd6aa5be2
2 changed files with 64 additions and 28 deletions

View File

@ -895,6 +895,10 @@ protected:
#ifndef NDEBUG
void traceCandidate(const SchedCandidate &Cand);
#endif
private:
/// Returns true if the current cycle plus the remaining latency is greater
/// than the critical path of the scheduling region.
///
/// \param Policy            Candidate policy being configured by the caller.
/// \param CurrZone          Scheduling zone whose current cycle is examined.
/// \param ComputeRemLatency When true, the remaining latency is recomputed
///                          and stored into \p RemLatency if it is needed;
///                          when false, the value already held in
///                          \p RemLatency is used as-is.
/// \param RemLatency        In/out remaining latency for \p CurrZone.
bool shouldReduceLatency(const CandPolicy &Policy, SchedBoundary &CurrZone,
bool ComputeRemLatency, unsigned &RemLatency) const;
};
// Utility functions used by heuristics in tryCandidate().

View File

@ -2397,6 +2397,52 @@ initResourceDelta(const ScheduleDAGMI *DAG,
}
}
/// Compute remaining latency. We need this both to determine whether the
/// overall schedule has become latency-limited and whether the instructions
/// outside this zone are resource or latency limited.
///
/// The "dependent" latency is updated incrementally during scheduling as the
/// max height/depth of scheduled nodes minus the cycles since it was
/// scheduled:
/// DLat = max (N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
///
/// The "independent" latency is the max ready queue depth:
/// ILat = max N.depth for N in Available|Pending
///
/// RemainingLatency is the greater of independent and dependent latency.
///
/// These computations are expensive, especially in DAGs with many edges, so
/// only do them if necessary.
static unsigned computeRemLatency(SchedBoundary &CurrZone) {
  // Dependent latency, as tracked by the zone itself.
  const unsigned DependentLat = CurrZone.getDependentLatency();

  // Independent latency: the maximum latency over the nodes still sitting in
  // the Available queue and in the Pending queue.
  const unsigned AvailableLat =
      CurrZone.findMaxLatency(CurrZone.Available.elements());
  const unsigned PendingLat =
      CurrZone.findMaxLatency(CurrZone.Pending.elements());

  // The remaining latency is the largest of the three values.
  return std::max(DependentLat, std::max(AvailableLat, PendingLat));
}
/// Returns true if the current cycle plus remaining latency is greater than
/// the critical path in the scheduling region.
bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
                                               SchedBoundary &CurrZone,
                                               bool ComputeRemLatency,
                                               unsigned &RemLatency) const {
  // Read the zone's cycle once and reuse it for every comparison below.
  // NOTE(review): assumes computing the remaining latency does not advance
  // the zone's current cycle -- confirm against SchedBoundary.
  const auto Cycle = CurrZone.getCurrCycle();

  // Once the cycle count alone exceeds the critical path the zone is already
  // latency limited, so the expensive remaining-latency query can be skipped.
  if (Cycle > Rem.CriticalPath)
    return true;

  // Nothing has been scheduled yet, so the zone cannot be latency limited.
  if (Cycle == 0)
    return false;

  // Only pay for the remaining-latency computation when the caller has not
  // already supplied an up-to-date value in RemLatency.
  if (ComputeRemLatency)
    RemLatency = computeRemLatency(CurrZone);

  return RemLatency + Cycle > Rem.CriticalPath;
}
/// Set the CandPolicy given a scheduling zone given the current resources and
/// latencies inside and outside the zone.
void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
@ -2406,47 +2452,33 @@ void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
// inside and outside this zone. Potential stalls should be considered before
// following this policy.
// Compute remaining latency. We need this both to determine whether the
// overall schedule has become latency-limited and whether the instructions
// outside this zone are resource or latency limited.
//
// The "dependent" latency is updated incrementally during scheduling as the
// max height/depth of scheduled nodes minus the cycles since it was
// scheduled:
// DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
//
// The "independent" latency is the max ready queue depth:
// ILat = max N.depth for N in Available|Pending
//
// RemainingLatency is the greater of independent and dependent latency.
unsigned RemLatency = CurrZone.getDependentLatency();
RemLatency = std::max(RemLatency,
CurrZone.findMaxLatency(CurrZone.Available.elements()));
RemLatency = std::max(RemLatency,
CurrZone.findMaxLatency(CurrZone.Pending.elements()));
// Compute the critical resource outside the zone.
unsigned OtherCritIdx = 0;
unsigned OtherCount =
OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
bool OtherResLimited = false;
if (SchedModel->hasInstrSchedModel())
unsigned RemLatency = 0;
bool RemLatencyComputed = false;
if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
RemLatency = computeRemLatency(CurrZone);
RemLatencyComputed = true;
OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
OtherCount, RemLatency);
}
// Schedule aggressively for latency in PostRA mode. We don't check for
// acyclic latency during PostRA, and highly out-of-order processors will
// skip PostRA scheduling.
if (!OtherResLimited) {
if (IsPostRA || (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
if (!OtherResLimited &&
(IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
RemLatency))) {
Policy.ReduceLatency |= true;
LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName()
<< " RemainingLatency " << RemLatency << " + "
<< CurrZone.getCurrCycle() << "c > CritPath "
<< Rem.CriticalPath << "\n");
}
}
// If the same resource is limiting inside and outside the zone, do nothing.
if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
return;