2019-03-27 11:50:16 +08:00
|
|
|
//===- PPCMachineScheduler.cpp - MI Scheduler for PowerPC -------------===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
2019-06-04 17:16:31 +08:00
|
|
|
|
2019-03-27 11:50:16 +08:00
|
|
|
#include "PPCMachineScheduler.h"
|
2019-06-04 17:16:31 +08:00
|
|
|
#include "MCTargetDesc/PPCMCTargetDesc.h"
|
|
|
|
|
2019-03-27 11:50:16 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2019-05-24 13:30:09 +08:00
|
|
|
// Hidden command-line switch (-disable-ppc-sched-addi-load). When set, the
// pre-RA ADDI/load bias in this file is skipped.
//
// The description is split across two adjacent string literals, which the
// compiler concatenates verbatim; the first piece therefore ends with a space
// so the help text reads "... before load ..." rather than "... beforeload ...".
static cl::opt<bool>
    DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
                             cl::desc("Disable scheduling addi instruction "
                                      "before load for ppc"),
                             cl::Hidden);
|
[Power9] Add addi post-ra scheduling heuristic
The addi instruction is usually used to post-increment the loop induction variable, which looks like this:
label_X:
load x, base(i)
...
y = op x
...
i = addi i, 1
goto label_X
However, on PowerPC, if there are too many VSX instructions between y = op x and i = addi i, 1,
they use up all the hardware resources and block the execution of i = addi i, 1, which results in a stall
of the load instruction in the next iteration. So a heuristic is added, applied only when no other heuristic
has decided, that moves the addi as early as possible so the load can hide the latency of the VSX instructions and starvation is avoided.
Reviewed By: jji
Differential Revision: https://reviews.llvm.org/D80269
2020-06-08 09:31:07 +08:00
|
|
|
// Hidden command-line switch (-ppc-postra-bias-addi), enabled by default. It
// gates the post-RA heuristic in this file that prefers an ADDI candidate.
//
// The description is split across two adjacent string literals, which the
// compiler concatenates verbatim; the first piece therefore ends with a space
// so the help text reads "... as early as possible ..." rather than
// "... as earlyas possible ...".
static cl::opt<bool>
    EnableAddiHeuristic("ppc-postra-bias-addi",
                        cl::desc("Enable scheduling addi instruction as early "
                                 "as possible post ra"),
                        cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
/// Returns true when the machine instruction behind \p Cand has opcode
/// PPC::ADDI or PPC::ADDI8, and false for every other opcode.
static bool isADDIInstr(const GenericScheduler::SchedCandidate &Cand) {
  switch (Cand.SU->getInstr()->getOpcode()) {
  case PPC::ADDI:
  case PPC::ADDI8:
    return true;
  default:
    return false;
  }
}
|
2019-05-24 13:30:09 +08:00
|
|
|
|
|
|
|
/// PowerPC pre-RA bias between an ADDI and a load.
///
/// The two candidates are first put in a zone-dependent order: in the top
/// zone TryCand is the "leading" one and Cand the "trailing" one; in the
/// bottom zone the roles are swapped.
///
///  - leading is ADDI, trailing may load  -> TryCand.Reason = Stall
///  - leading may load, trailing is ADDI  -> TryCand.Reason = NoCand
///
/// \returns true if one of the two patterns matched (and TryCand.Reason was
/// written); false if the heuristic is disabled through
/// DisableAddiLoadHeuristic or neither pattern matched.
bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
                                                  SchedCandidate &TryCand,
                                                  SchedBoundary &Zone) const {
  if (DisableAddiLoadHeuristic)
    return false;

  const bool InTopZone = Zone.isTop();
  SchedCandidate &Leading = InTopZone ? TryCand : Cand;
  SchedCandidate &Trailing = InTopZone ? Cand : TryCand;

  // ADDI ahead of a load: select TryCand.
  if (isADDIInstr(Leading) && Trailing.SU->getInstr()->mayLoad()) {
    TryCand.Reason = Stall;
    return true;
  }

  // Load ahead of an ADDI: mark TryCand as not selected.
  if (Leading.SU->getInstr()->mayLoad() && isADDIInstr(Trailing)) {
    TryCand.Reason = NoCand;
    return true;
  }

  return false;
}
|
|
|
|
|
|
|
|
/// Pre-RA candidate comparison for PowerPC.
///
/// Runs the heuristic sequence taken from GenericScheduler::tryCandidate and
/// then, if TryCand was either not selected or selected only by node order,
/// consults the PowerPC ADDI/load bias.
///
/// \param Cand     Current best candidate. If it is not valid, TryCand is
///                 accepted immediately with reason NodeOrder.
/// \param TryCand  Candidate being evaluated; its Reason field is updated.
/// \param Zone     Non-null when both candidates are compared within a single
///                 boundary; null otherwise, in which case the checks that
///                 need a zone are skipped.
void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                         SchedCandidate &TryCand,
                                         SchedBoundary *Zone) const {
  // ---- Begin: sequence from GenericScheduler::tryCandidate ----

  // No valid incumbent yet: take TryCand.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // Physical-register bias (defs/copies placed relative to their uses/defs).
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return;

  // Register pressure: do not exceed the target's limit.
  if (DAG->isTrackingPressure()) {
    if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                    RegExcess, TRI, DAG->MF))
      return;
  }

  // Register pressure: do not raise the region's max critical pressure.
  if (DAG->isTrackingPressure()) {
    if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                    TryCand, Cand, RegCritical, TRI, DAG->MF))
      return;
  }

  // Only a subset of the heuristics is meaningful when the two candidates
  // come from different boundaries (Top vs. Bottom); the tie-breaking style
  // checks below run only when a single zone was supplied.
  const bool WithinOneZone = Zone != nullptr;
  if (WithinOneZone) {
    // Acyclic-path-limited loops: schedule aggressively for latency, but only
    // while no micro-ops have been issued in the current cycle.
    const bool LatencyFirst =
        Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps();
    if (LatencyFirst && tryLatency(TryCand, Cand, *Zone))
      return;

    // Fewer latency stall cycles wins.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return;
  }

  // Clustering: keep clustered nodes adjacent so that later peephole
  // optimizations can reduce resource requirements. This is best effort for a
  // post-RA pass; combining loads would ideally be done during DAG
  // postprocessing in the scheduler itself.
  const SUnit *ClusterNextForCand =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *ClusterNextForTry =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == ClusterNextForTry,
                 Cand.SU == ClusterNextForCand, TryCand, Cand, Cluster))
    return;

  // Weak edges (clustering and other constraints).
  if (WithinOneZone &&
      tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
              getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
    return;

  // Register pressure: do not raise the max pressure of the whole region.
  if (DAG->isTrackingPressure()) {
    if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax,
                    TryCand, Cand, RegMax, TRI, DAG->MF))
      return;
  }

  if (WithinOneZone) {
    // Resource balance: avoid critical resources, favor demanded ones.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return;

    // Avoid serializing long-latency chains. For acyclic-path-limited loops
    // latency has already been considered above, so it is not repeated.
    const bool LatencyHeuristicOn = !RegionPolicy.DisableLatencyHeuristic &&
                                    TryCand.Policy.ReduceLatency &&
                                    !Rem.IsAcyclicLatencyLimited;
    if (LatencyHeuristicOn && tryLatency(TryCand, Cand, *Zone))
      return;

    // Last resort: original instruction order (ascending NodeNum at the top
    // boundary, descending at the bottom).
    const bool KeepSourceOrder = Zone->isTop()
                                     ? TryCand.SU->NodeNum < Cand.SU->NodeNum
                                     : TryCand.SU->NodeNum > Cand.SU->NodeNum;
    if (KeepSourceOrder)
      TryCand.Reason = NodeOrder;
  }

  // ---- End: sequence from GenericScheduler::tryCandidate ----

  // The PowerPC-specific heuristic is consulted only if TryCand was not
  // selected (NoCand) or was selected merely by node order, and only when
  // both candidates are in the same zone.
  const bool OpenToPPCBias =
      TryCand.Reason == NodeOrder || TryCand.Reason == NoCand;
  if (!OpenToPPCBias || !WithinOneZone)
    return;

  // Placing the ADDI before the load can help hide latency, since register
  // allocation may introduce a true dependency between the two. The result is
  // not needed here: TryCand.Reason has already been updated by the callee.
  (void)biasAddiLoadCandidate(Cand, TryCand, *Zone);
}
|
|
|
|
|
[Power9] Add addi post-ra scheduling heuristic
The addi instruction is usually used to post-increment the loop induction variable, which looks like this:
label_X:
load x, base(i)
...
y = op x
...
i = addi i, 1
goto label_X
However, on PowerPC, if there are too many VSX instructions between y = op x and i = addi i, 1,
they use up all the hardware resources and block the execution of i = addi i, 1, which results in a stall
of the load instruction in the next iteration. So a heuristic is added, applied only when no other heuristic
has decided, that moves the addi as early as possible so the load can hide the latency of the VSX instructions and starvation is avoided.
Reviewed By: jji
Differential Revision: https://reviews.llvm.org/D80269
2020-06-08 09:31:07 +08:00
|
|
|
/// PowerPC post-RA bias toward ADDI.
///
/// When EnableAddiHeuristic is set and TryCand is an ADDI while Cand is not,
/// TryCand is selected with reason Stall.
///
/// \returns true if TryCand.Reason was written, false otherwise (heuristic
/// disabled, TryCand is not an ADDI, or Cand is an ADDI as well).
bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) const {
  const bool PreferTryCand =
      EnableAddiHeuristic && isADDIInstr(TryCand) && !isADDIInstr(Cand);
  if (PreferTryCand)
    TryCand.Reason = Stall;
  return PreferTryCand;
}
|
|
|
|
|
|
|
|
/// Post-RA candidate comparison for PowerPC.
///
/// Runs the heuristic sequence taken from PostGenericScheduler::tryCandidate
/// and then, if TryCand was either not selected or selected only by node
/// order, consults the PowerPC ADDI bias.
///
/// \param Cand     Current best candidate. If it is not valid, TryCand is
///                 accepted immediately with reason NodeOrder.
/// \param TryCand  Candidate being evaluated; its Reason field is updated.
void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand) {
  // ---- Begin: sequence from PostGenericScheduler::tryCandidate ----

  // No valid incumbent yet: take TryCand.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // Fewer latency stall cycles at the top boundary wins.
  if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
              Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
    return;

  // Clustering: prefer the node that continues the current cluster.
  const SUnit *ClusterNext = DAG->getNextClusterSucc();
  if (tryGreater(TryCand.SU == ClusterNext, Cand.SU == ClusterNext, TryCand,
                 Cand, Cluster))
    return;

  // Resource balance: avoid critical resources, favor demanded ones.
  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
              TryCand, Cand, ResourceReduce))
    return;
  if (tryGreater(TryCand.ResDelta.DemandedResources,
                 Cand.ResDelta.DemandedResources, TryCand, Cand,
                 ResourceDemand))
    return;

  // Avoid serializing long-latency dependence chains.
  if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top))
    return;

  // Last resort: original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
    TryCand.Reason = NodeOrder;

  // ---- End: sequence from PostGenericScheduler::tryCandidate ----

  // The PowerPC-specific heuristic is consulted only if TryCand was not
  // selected (NoCand) or was selected merely by node order.
  const bool OpenToPPCBias =
      TryCand.Reason == NodeOrder || TryCand.Reason == NoCand;
  if (!OpenToPPCBias)
    return;

  // Scheduling ADDI as early as possible post-RA keeps it from being stalled
  // behind vector instructions that occupy all the hardware units. ADDI is
  // typically the loop induction-variable post-increment, so this matters for
  // performance. The result is not needed here: TryCand.Reason has already
  // been updated by the callee.
  (void)biasAddiCandidate(Cand, TryCand);
}
|
|
|
|
|
2019-03-27 11:50:16 +08:00
|
|
|
/// Hook invoked with the basic block \p MBB. It currently carries no
/// PowerPC-specific logic and only forwards to the PostGenericScheduler
/// implementation.
void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
  // Extension point for PPC post-RA behavior on block entry.
  this->PostGenericScheduler::enterMBB(MBB);
}
|
|
|
|
|
|
|
|
void PPCPostRASchedStrategy::leaveMBB() {
|
|
|
|
// Custom PPC PostRA specific behavior here.
|
|
|
|
PostGenericScheduler::leaveMBB();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Initializes the strategy for the scheduling DAG \p Dag. No
/// PowerPC-specific setup is done yet; the call is forwarded to the
/// PostGenericScheduler implementation.
void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) {
  // Extension point for PPC post-RA initialization.
  this->PostGenericScheduler::initialize(Dag);
}
|
|
|
|
|
|
|
|
/// Picks the next node to schedule. No PowerPC-specific selection is done
/// here; the choice, including the value written to \p IsTopNode, comes from
/// the PostGenericScheduler implementation.
SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) {
  // Extension point for PPC post-RA node selection.
  SUnit *Picked = PostGenericScheduler::pickNode(IsTopNode);
  return Picked;
}
|
|
|
|
|