misched: Target-independent support for load/store clustering.
This infrastructure is generally useful for any target that wants to strongly prefer that two instructions be adjacent after scheduling. A following checkin will add target-specific hooks with unit tests. Then this feature will be enabled by default with misched.

llvm-svn: 167742
commit a7714a0ff9
parent 7ea7397ebc
@@ -202,6 +202,10 @@ protected:
  RegisterClassInfo *RegClassInfo;
  MachineSchedStrategy *SchedImpl;

  /// Topo - A topological ordering for SUnits which permits fast IsReachable
  /// and similar queries.
  ScheduleDAGTopologicalSort Topo;

  /// Ordered list of DAG postprocessing steps.
  std::vector<ScheduleDAGMutation*> Mutations;

@@ -226,6 +230,10 @@ protected:
  IntervalPressure BotPressure;
  RegPressureTracker BotRPTracker;

  /// Record the next node in a scheduled cluster.
  const SUnit *NextClusterPred;
  const SUnit *NextClusterSucc;

#ifndef NDEBUG
  /// The number of instructions scheduled so far. Used to cut off the
  /// scheduler at the point determined by misched-cutoff.
@@ -236,24 +244,35 @@ public:
  ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
    ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
    AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
    CurrentBottom(), BotRPTracker(BotPressure) {
    Topo(SUnits, &ExitSU), RPTracker(RegPressure), CurrentTop(),
    TopRPTracker(TopPressure), CurrentBottom(), BotRPTracker(BotPressure),
    NextClusterPred(NULL), NextClusterSucc(NULL) {
#ifndef NDEBUG
    NumInstrsScheduled = 0;
#endif
  }

  virtual ~ScheduleDAGMI() {
    DeleteContainerPointers(Mutations);
    delete SchedImpl;
  }

  /// Add a postprocessing step to the DAG builder.
  /// Mutations are applied in the order that they are added after normal DAG
  /// building and before MachineSchedStrategy initialization.
  ///
  /// ScheduleDAGMI takes ownership of the Mutation object.
  void addMutation(ScheduleDAGMutation *Mutation) {
    Mutations.push_back(Mutation);
  }

  /// \brief Add a DAG edge to the given SU with the given predecessor
  /// dependence data.
  ///
  /// \returns true if the edge may be added without creating a cycle OR if an
  /// equivalent edge already existed (false indicates failure).
  bool addEdge(SUnit *SuccSU, const SDep &PredDep);

  MachineBasicBlock::iterator top() const { return CurrentTop; }
  MachineBasicBlock::iterator bottom() const { return CurrentBottom; }

@@ -285,6 +304,10 @@ public:
    return RegionCriticalPSets;
  }

  const SUnit *getNextClusterPred() const { return NextClusterPred; }

  const SUnit *getNextClusterSucc() const { return NextClusterSucc; }

protected:
  // Top-Level entry points for the schedule() driver...

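For illustration, a minimal client of the new mutation hook (hypothetical, not part of this patch) would follow the same pattern as the load-clustering pass added below: subclass ScheduleDAGMutation, implement apply(), and hand the object to addMutation(), which transfers ownership to ScheduleDAGMI:

    // Hypothetical example only; the class name and behavior are illustrative.
    class NoopMutation : public ScheduleDAGMutation {
    public:
      // Called once after DAG building, before the strategy is initialized.
      virtual void apply(ScheduleDAGMI *DAG) {
        // Inspect DAG->SUnits here and add weak edges with DAG->addEdge(...).
      }
    };

    // Registration; ScheduleDAGMI deletes the mutation in its destructor.
    //   DAG->addMutation(new NoopMutation());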
@@ -621,6 +621,19 @@ public:
    return false;
  }

  /// \brief Get the base register and byte offset of a load/store instr.
  virtual bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
                                    unsigned &BaseReg, unsigned &Offset,
                                    const TargetRegisterInfo *TRI) const {
    return false;
  }

  virtual bool shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
                                       MachineInstr *SecondLdSt,
                                       unsigned NumLoads) const {
    return false;
  }

  /// ReverseBranchCondition - Reverses the branch condition of the specified
  /// condition list, returning false on success and true if it cannot be
  /// reversed.
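Both hooks above default to returning false, so clustering stays off until a target opts in. A rough sketch of what a target override could look like (hypothetical target and opcode names; the real hooks landed in a follow-up commit):

    bool MyTargetInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
                                                 unsigned &BaseReg, unsigned &Offset,
                                                 const TargetRegisterInfo *TRI) const {
      // Only handle simple reg+imm addressing; anything else is not clusterable.
      switch (LdSt->getOpcode()) {
      default:
        return false;
      case MyTarget::LDRi12: // illustrative opcode
        BaseReg = LdSt->getOperand(1).getReg();
        Offset = LdSt->getOperand(2).getImm();
        return true;
      }
    }

    bool MyTargetInstrInfo::shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
                                                    MachineInstr *SecondLdSt,
                                                    unsigned NumLoads) const {
      // Keep clusters short so this heuristic does not dominate the schedule.
      return NumLoads < 4;
    }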
@@ -58,6 +58,10 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
           "before attempting to balance ILP"),
  cl::init(10U));

// Experimental heuristics
static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
  cl::desc("Enable load clustering."));

//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//

@@ -303,6 +307,17 @@ void ReadyQueue::dump() {
// preservation.
//===----------------------------------------------------------------------===//

bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
  // Do not use WillCreateCycle, it assumes SD scheduling.
  // If Pred is reachable from Succ, then the edge creates a cycle.
  if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
    return false;
  Topo.AddPred(SuccSU, PredDep.getSUnit());
  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
  // Return true regardless of whether a new edge needed to be inserted.
  return true;
}

/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
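In a mutation, this helper is typically used to add a weak cluster edge and to bail out gracefully when the edge would form a cycle; a minimal sketch of the call shape, mirroring the clustering code later in this patch:

    // addEdge returns true if the edge was added (or an equivalent edge already
    // existed); false means it would have created a cycle and nothing changed.
    if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
      // SUa and SUb are now weakly ordered; the strategy may keep them adjacent.
    }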
@@ -312,6 +327,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    if (SuccEdge->isCluster())
      NextClusterSucc = SuccSU;
    return;
  }
#ifndef NDEBUG
@@ -344,6 +361,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {

  if (PredEdge->isWeak()) {
    --PredSU->WeakSuccsLeft;
    if (PredEdge->isCluster())
      NextClusterPred = PredSU;
    return;
  }
#ifndef NDEBUG
@@ -482,6 +501,8 @@ updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
void ScheduleDAGMI::schedule() {
  buildDAGWithRegPressure();

  Topo.InitDAGTopologicalSorting();

  postprocessDAG();

  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
@@ -562,6 +583,8 @@ void ScheduleDAGMI::releaseRoots() {

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues() {
  NextClusterSucc = NULL;
  NextClusterPred = NULL;

  // Initialize the strategy before modifying the DAG.
  SchedImpl->initialize(this);
@@ -664,6 +687,119 @@ void ScheduleDAGMI::dumpSchedule() const {
}
#endif

namespace {
/// \brief Post-process the DAG to create cluster edges between neighboring
/// loads.
class LoadClusterMutation : public ScheduleDAGMutation {
  struct LoadInfo {
    SUnit *SU;
    unsigned BaseReg;
    unsigned Offset;
    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
      : SU(su), BaseReg(reg), Offset(ofs) {}
  };
  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
                           const LoadClusterMutation::LoadInfo &RHS);

  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
public:
  LoadClusterMutation(const TargetInstrInfo *tii,
                      const TargetRegisterInfo *tri)
    : TII(tii), TRI(tri) {}

  virtual void apply(ScheduleDAGMI *DAG);
protected:
  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
};
} // anonymous

bool LoadClusterMutation::LoadInfoLess(
  const LoadClusterMutation::LoadInfo &LHS,
  const LoadClusterMutation::LoadInfo &RHS) {
  if (LHS.BaseReg != RHS.BaseReg)
    return LHS.BaseReg < RHS.BaseReg;
  return LHS.Offset < RHS.Offset;
}

void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
                                                  ScheduleDAGMI *DAG) {
  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
    SUnit *SU = Loads[Idx];
    unsigned BaseReg;
    unsigned Offset;
    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
  }
  if (LoadRecords.size() < 2)
    return;
  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
  unsigned ClusterLength = 1;
  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
      ClusterLength = 1;
      continue;
    }

    SUnit *SUa = LoadRecords[Idx].SU;
    SUnit *SUb = LoadRecords[Idx+1].SU;
    if (TII->shouldScheduleLoadsNear(SUa->getInstr(), SUb->getInstr(),
                                     ClusterLength)
        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {

      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
            << SUb->NodeNum << ")\n");
      // Copy successor edges from SUa to SUb. Interleaving computation
      // dependent on SUa can prevent load combining due to register reuse.
      // Predecessor edges do not need to be copied from SUb to SUa since nearby
      // loads should have effectively the same inputs.
      for (SUnit::const_succ_iterator
             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
        if (SI->getSUnit() == SUb)
          continue;
        DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
      }
      ++ClusterLength;
    }
    else
      ClusterLength = 1;
  }
}

/// \brief Callback from DAG postProcessing to create cluster edges for loads.
void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
  // Map DAG NodeNum to store chain ID.
  DenseMap<unsigned, unsigned> StoreChainIDs;
  // Map each store chain to a set of dependent loads.
  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
    SUnit *SU = &DAG->SUnits[Idx];
    if (!SU->getInstr()->mayLoad())
      continue;
    unsigned ChainPredID = DAG->SUnits.size();
    for (SUnit::const_pred_iterator
           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
      if (PI->isCtrl()) {
        ChainPredID = PI->getSUnit()->NodeNum;
        break;
      }
    }
    // Check if this chain-like pred has been seen
    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
    unsigned NumChains = StoreChainDependents.size();
    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
    if (Result.second)
      StoreChainDependents.resize(NumChains + 1);
    StoreChainDependents[Result.first->second].push_back(SU);
  }
  // Iterate over the store chains.
  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
}

//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
//===----------------------------------------------------------------------===//

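Worked through a small, made-up example: suppose one store-chain bucket contains four loads whose (BaseReg, Offset) pairs are (r1, 0), (r1, 8), (r1, 4), (r2, 0). LoadInfoLess sorts them to (r1, 0), (r1, 4), (r1, 8), (r2, 0); the loop then queries shouldScheduleLoadsNear() for each adjacent same-base pair with the growing ClusterLength, and each accepted pair gets a weak Cluster edge, chaining the three r1 loads while the r2 load starts a new cluster of length 1.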
@@ -676,9 +812,10 @@ public:
  /// Represent the type of SchedCandidate found within a single queue.
  /// pickNodeBidirectional depends on these listed by decreasing priority.
  enum CandReason {
    NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
    BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
    SingleMax, MultiPressure, NextDefUse, NodeOrder};
    NoCand, SingleExcess, SingleCritical, Cluster,
    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
    TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
    NodeOrder};

#ifndef NDEBUG
  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
@@ -1029,6 +1166,8 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {

  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
       I != E; ++I) {
    if (I->isWeak())
      continue;
    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
    unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
@@ -1424,6 +1563,7 @@ static bool tryLess(unsigned TryVal, unsigned CandVal,
  }
  return false;
}

static bool tryGreater(unsigned TryVal, unsigned CandVal,
                       ConvergingScheduler::SchedCandidate &TryCand,
                       ConvergingScheduler::SchedCandidate &Cand,
@@ -1440,6 +1580,10 @@ static bool tryGreater(unsigned TryVal, unsigned CandVal,
  return false;
}

static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
  return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}

/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
@@ -1482,6 +1626,26 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
    if (Cand.Reason == SingleCritical)
      Cand.Reason = MultiPressure;

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *NextClusterSU =
    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
                 TryCand, Cand, Cluster))
    return;
  // Currently, weak edges are for clustering, so we hard-code that reason.
  // However, deferring the current TryCand will not change Cand's reason.
  CandReason OrigReason = Cand.Reason;
  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
              getWeakLeft(Cand.SU, Zone.isTop()),
              TryCand, Cand, Cluster)) {
    Cand.Reason = OrigReason;
    return;
  }
  // Avoid critical resource consumption and balance the schedule.
  TryCand.initResourceDelta(DAG, SchedModel);
  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
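For a concrete, ARM-flavoured and purely illustrative picture of the peephole being set up here: two clustered loads such as ldr r0, [r4] and ldr r1, [r4, #4] end up adjacent in the final schedule, which lets a later post-RA pass rewrite them as a single ldrd r0, r1, [r4] or fold them into a load-multiple, reducing the number of memory operations issued.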
@@ -1528,15 +1692,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
  // Prefer immediate defs/users of the last scheduled instruction. This is a
  // nice pressure avoidance strategy that also conserves the processor's
  // register renaming resources and keeps the machine code readable.
  if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
    TryCand.Reason = NextDefUse;
  if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
                 TryCand, Cand, NextDefUse))
    return;
  }
  if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
    if (Cand.Reason > NextDefUse)
      Cand.Reason = NextDefUse;
    return;
  }

  // Fall through to original instruction order.
  if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
      || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
@@ -1582,6 +1741,7 @@ const char *ConvergingScheduler::getReasonStr(
  case NoCand:         return "NOCAND    ";
  case SingleExcess:   return "REG-EXCESS";
  case SingleCritical: return "REG-CRIT  ";
  case Cluster:        return "CLUSTER   ";
  case SingleMax:      return "REG-MAX   ";
  case MultiPressure:  return "REG-MULTI ";
  case ResourceReduce: return "RES-REDUCE";
@@ -1822,7 +1982,11 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
  assert((!ForceTopDown || !ForceBottomUp) &&
         "-misched-topdown incompatible with -misched-bottomup");
  return new ScheduleDAGMI(C, new ConvergingScheduler());
  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
  // Register DAG post-processors.
  if (EnableLoadCluster)
    DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
  return DAG;
}
static MachineSchedRegistry
ConvergingSchedRegistry("converge", "Standard converging scheduler.",
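A hedged usage note: because EnableLoadCluster is hidden and has no cl::init value, the mutation is only registered when the flag is passed explicitly, e.g. (assuming the standard llc flags of this period) something along the lines of

    llc -enable-misched -misched-cluster foo.ll

until the commit message's plan to enable it by default with misched lands.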
@@ -1373,6 +1373,9 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
/// only return true if the base pointers are the same and the only difference
/// between the two addresses is the offset. It also returns the offsets by
/// reference.
///
/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
/// is permanently disabled.
bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                               int64_t &Offset1,
                                               int64_t &Offset2) const {
@@ -1447,6 +1450,9 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
///
/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
/// is permanently disabled.
bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                               int64_t Offset1, int64_t Offset2,
                                               unsigned NumLoads) const {