2017-09-12 07:00:48 +08:00
|
|
|
//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
|
2016-07-30 00:44:44 +08:00
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2016-07-30 00:44:44 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
|
|
|
|
//
|
|
|
|
// This SMS implementation is a target-independent back-end pass. When enabled,
|
|
|
|
// the pass runs just prior to the register allocation pass, while the machine
|
|
|
|
// IR is in SSA form. If software pipelining is successful, then the original
|
|
|
|
// loop is replaced by the optimized loop. The optimized loop contains one or
|
|
|
|
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
|
|
|
|
// the instructions cannot be scheduled in a given MII, we increase the MII by
|
|
|
|
// one and try again.
|
|
|
|
//
|
|
|
|
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
|
|
|
|
// represent loop carried dependences in the DAG as order edges to the Phi
|
|
|
|
// nodes. We also perform several passes over the DAG to eliminate unnecessary
|
|
|
|
// edges that inhibit the ability to pipeline. The implementation uses the
|
|
|
|
// DFAPacketizer class to compute the minimum initiation interval and to check
|
|
|
|
// where an instruction may be inserted in the pipelined schedule.
|
|
|
|
//
|
|
|
|
// In order for the SMS pass to work, several target specific hooks need to be
|
|
|
|
// implemented to get information about the loop structure and to rewrite
|
|
|
|
// instructions.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
|
|
#include "llvm/ADT/BitVector.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/ADT/DenseMap.h"
|
|
|
|
#include "llvm/ADT/MapVector.h"
|
|
|
|
#include "llvm/ADT/PriorityQueue.h"
|
|
|
|
#include "llvm/ADT/SetVector.h"
|
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
#include "llvm/ADT/SmallSet.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/ADT/iterator_range.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/Analysis/MemoryLocation.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
|
|
#include "llvm/CodeGen/DFAPacketizer.h"
|
2017-12-13 10:51:04 +08:00
|
|
|
#include "llvm/CodeGen/LiveIntervals.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
|
|
#include "llvm/CodeGen/MachineDominators.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
|
|
#include "llvm/CodeGen/MachineLoopInfo.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/CodeGen/MachineMemOperand.h"
|
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
2019-01-15 01:24:11 +08:00
|
|
|
#include "llvm/CodeGen/MachinePipeliner.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2019-08-31 02:49:50 +08:00
|
|
|
#include "llvm/CodeGen/ModuloSchedule.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/CodeGen/RegisterPressure.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/CodeGen/ScheduleDAG.h"
|
2016-12-23 03:21:20 +08:00
|
|
|
#include "llvm/CodeGen/ScheduleDAGMutation.h"
|
2017-11-17 09:07:10 +08:00
|
|
|
#include "llvm/CodeGen/TargetOpcodes.h"
|
|
|
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
|
|
|
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
2018-04-30 22:59:11 +08:00
|
|
|
#include "llvm/Config/llvm-config.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/IR/Attributes.h"
|
|
|
|
#include "llvm/IR/DebugLoc.h"
|
2017-09-12 07:00:48 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/MC/LaneBitmask.h"
|
|
|
|
#include "llvm/MC/MCInstrDesc.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/MC/MCInstrItineraries.h"
|
2017-09-12 07:00:48 +08:00
|
|
|
#include "llvm/MC/MCRegisterInfo.h"
|
|
|
|
#include "llvm/Pass.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2017-09-12 07:00:48 +08:00
|
|
|
#include "llvm/Support/Compiler.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2016-07-30 00:44:44 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2016-08-12 01:20:18 +08:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
2016-07-30 00:44:44 +08:00
|
|
|
#include <climits>
|
2016-08-12 01:20:18 +08:00
|
|
|
#include <cstdint>
|
2016-07-30 00:44:44 +08:00
|
|
|
#include <deque>
|
2016-08-12 01:20:18 +08:00
|
|
|
#include <functional>
|
|
|
|
#include <iterator>
|
2016-07-30 00:44:44 +08:00
|
|
|
#include <map>
|
2017-09-12 07:00:48 +08:00
|
|
|
#include <memory>
|
2016-08-12 01:20:18 +08:00
|
|
|
#include <tuple>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "pipeliner"
|
|
|
|
|
|
|
|
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
|
|
|
|
STATISTIC(NumPipelined, "Number of loops software pipelined");
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
|
2019-05-31 23:35:19 +08:00
|
|
|
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
|
|
|
|
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
|
|
|
|
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
|
|
|
|
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
|
|
|
|
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
|
|
|
|
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
|
|
|
|
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
|
|
|
|
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
/// A command line option to turn software pipelining on or off.
|
2016-08-06 19:13:10 +08:00
|
|
|
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
|
|
|
|
cl::ZeroOrMore,
|
|
|
|
cl::desc("Enable Software Pipelining"));
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
/// A command line option to enable SWP at -Os.
|
|
|
|
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
|
|
|
|
cl::desc("Enable SWP at Os."), cl::Hidden,
|
|
|
|
cl::init(false));
|
|
|
|
|
|
|
|
/// A command line argument to set an upper bound on the minimum initiation
/// interval (MII) for pipelining.
|
|
|
|
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
|
2018-01-17 20:29:38 +08:00
|
|
|
cl::desc("Size limit for the MII."),
|
2016-07-30 00:44:44 +08:00
|
|
|
cl::Hidden, cl::init(27));
|
|
|
|
|
|
|
|
/// A command line argument to limit the number of stages in the pipeline.
|
|
|
|
static cl::opt<int>
|
|
|
|
SwpMaxStages("pipeliner-max-stages",
|
|
|
|
cl::desc("Maximum stages allowed in the generated scheduled."),
|
|
|
|
cl::Hidden, cl::init(3));
|
|
|
|
|
|
|
|
/// A command line option to disable the pruning of chain dependences due to
|
|
|
|
/// an unrelated Phi.
|
|
|
|
static cl::opt<bool>
|
|
|
|
SwpPruneDeps("pipeliner-prune-deps",
|
|
|
|
cl::desc("Prune dependences between unrelated Phi nodes."),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
/// A command line option to disable the pruning of loop carried order
|
|
|
|
/// dependences.
|
|
|
|
static cl::opt<bool>
|
|
|
|
SwpPruneLoopCarried("pipeliner-prune-loop-carried",
|
|
|
|
cl::desc("Prune loop carried order dependences."),
|
|
|
|
cl::Hidden, cl::init(true));
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
|
|
|
|
cl::ReallyHidden, cl::init(false),
|
|
|
|
cl::ZeroOrMore, cl::desc("Ignore RecMII"));
|
|
|
|
|
2019-06-19 04:24:49 +08:00
|
|
|
static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
|
|
|
|
cl::init(false));
|
|
|
|
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
|
|
|
|
cl::init(false));
|
|
|
|
|
2019-09-03 16:20:31 +08:00
|
|
|
static cl::opt<bool> EmitTestAnnotations(
|
|
|
|
"pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
|
|
|
|
cl::desc("Instead of emitting the pipelined code, annotate instructions "
|
|
|
|
"with the generated schedule for feeding into the "
|
|
|
|
"-modulo-schedule-test pass"));
|
|
|
|
|
2019-09-04 20:54:24 +08:00
|
|
|
static cl::opt<bool> ExperimentalCodeGen(
|
|
|
|
"pipeliner-experimental-cg", cl::Hidden, cl::init(false),
|
|
|
|
cl::desc(
|
|
|
|
"Use the experimental peeling code generator for software pipelining"));
|
|
|
|
|
2019-01-15 01:24:11 +08:00
|
|
|
namespace llvm {
|
|
|
|
|
2018-10-23 15:58:41 +08:00
|
|
|
// A command line option to enable the CopyToPhi DAG mutation.
|
2019-01-15 01:24:11 +08:00
|
|
|
cl::opt<bool>
|
2018-10-23 22:27:45 +08:00
|
|
|
SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
|
|
|
|
cl::init(true), cl::ZeroOrMore,
|
|
|
|
cl::desc("Enable CopyToPhi DAG Mutation"));
|
|
|
|
|
2019-01-15 01:24:11 +08:00
|
|
|
} // end namespace llvm
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
|
|
|
|
char MachinePipeliner::ID = 0;
|
|
|
|
#ifndef NDEBUG
|
|
|
|
int MachinePipeliner::NumTries = 0;
|
|
|
|
#endif
|
|
|
|
char &llvm::MachinePipelinerID = MachinePipeliner::ID;
|
2017-09-12 07:00:48 +08:00
|
|
|
|
2017-05-26 05:26:32 +08:00
|
|
|
INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
|
2016-07-30 00:44:44 +08:00
|
|
|
"Modulo Software Pipelining", false, false)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
|
2017-05-26 05:26:32 +08:00
|
|
|
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
|
2016-07-30 00:44:44 +08:00
|
|
|
"Modulo Software Pipelining", false, false)
|
|
|
|
|
|
|
|
/// The "main" function for implementing Swing Modulo Scheduling.
|
|
|
|
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
|
2017-12-16 06:22:58 +08:00
|
|
|
if (skipFunction(mf.getFunction()))
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!EnableSWP)
|
|
|
|
return false;
|
|
|
|
|
2017-12-16 06:22:58 +08:00
|
|
|
if (mf.getFunction().getAttributes().hasAttribute(
|
Rename AttributeSet to AttributeList
Summary:
This class is a list of AttributeSetNodes corresponding the function
prototype of a call or function declaration. This class used to be
called ParamAttrListPtr, then AttrListPtr, then AttributeSet. It is
typically accessed by parameter and return value index, so
"AttributeList" seems like a more intuitive name.
Rename AttributeSetImpl to AttributeListImpl to follow suit.
It's useful to rename this class so that we can rename AttributeSetNode
to AttributeSet later. AttributeSet is the set of attributes that apply
to a single function, argument, or return value.
Reviewers: sanjoy, javed.absar, chandlerc, pete
Reviewed By: pete
Subscribers: pete, jholewinski, arsenm, dschuff, mehdi_amini, jfb, nhaehnle, sbc100, void, llvm-commits
Differential Revision: https://reviews.llvm.org/D31102
llvm-svn: 298393
2017-03-22 00:57:19 +08:00
|
|
|
AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
|
2016-07-30 00:44:44 +08:00
|
|
|
!EnableSWPOptSize.getPosition())
|
|
|
|
return false;
|
|
|
|
|
2019-06-12 01:40:39 +08:00
|
|
|
if (!mf.getSubtarget().enableMachinePipeliner())
|
|
|
|
return false;
|
|
|
|
|
2019-05-29 11:02:59 +08:00
|
|
|
// Cannot pipeline loops without instruction itineraries if we are using
|
|
|
|
// DFA for the pipeliner.
|
|
|
|
if (mf.getSubtarget().useDFAforSMS() &&
|
|
|
|
(!mf.getSubtarget().getInstrItineraryData() ||
|
|
|
|
mf.getSubtarget().getInstrItineraryData()->isEmpty()))
|
|
|
|
return false;
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
MF = &mf;
|
|
|
|
MLI = &getAnalysis<MachineLoopInfo>();
|
|
|
|
MDT = &getAnalysis<MachineDominatorTree>();
|
2020-05-05 22:27:59 +08:00
|
|
|
ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
|
2016-07-30 00:44:44 +08:00
|
|
|
TII = MF->getSubtarget().getInstrInfo();
|
|
|
|
RegClassInfo.runOnMachineFunction(*MF);
|
|
|
|
|
|
|
|
for (auto &L : *MLI)
|
|
|
|
scheduleLoop(*L);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Attempt to perform the SMS algorithm on the specified loop. This function is
|
|
|
|
/// the main entry point for the algorithm. The function identifies candidate
|
|
|
|
/// loops, calculates the minimum initiation interval, and attempts to schedule
|
|
|
|
/// the loop.
|
|
|
|
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
|
|
|
|
bool Changed = false;
|
|
|
|
for (auto &InnerLoop : L)
|
|
|
|
Changed |= scheduleLoop(*InnerLoop);
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
// Stop trying after reaching the limit (if any).
|
|
|
|
int Limit = SwpLoopLimit;
|
|
|
|
if (Limit >= 0) {
|
|
|
|
if (NumTries >= SwpLoopLimit)
|
|
|
|
return Changed;
|
|
|
|
NumTries++;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
setPragmaPipelineOptions(L);
|
|
|
|
if (!canPipelineLoop(L)) {
|
|
|
|
LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
|
2020-05-05 22:27:59 +08:00
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "Failed to pipeline loop";
|
|
|
|
});
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
return Changed;
|
2019-01-23 11:26:10 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
++NumTrytoPipeline;
|
|
|
|
|
|
|
|
Changed = swingModuloScheduler(L);
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
/// Read the pipelining hints attached to loop \p L and record them in
/// II_setByPragma and disabledByPragma.
///
/// The hints are looked up in the llvm.loop metadata on the terminator of the
/// IR basic block that corresponds to the loop's top block. Both members are
/// reset first, so a loop without metadata (or without a matching hint) ends
/// up with no requested initiation interval and pipelining not disabled.
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma state for the next loop in iteration. Both values must
  // be cleared; otherwise a hint found on one loop would still be in effect
  // for every later loop that carries no hint of its own.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();
  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  // Operand 0 is the self reference checked above; the hints follow it.
  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));

    // Skip non-node operands and nodes with no operands, so that reading
    // operand 0 below is always in range.
    if (MD == nullptr || MD->getNumOperands() == 0)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));
    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// Return true if the loop can be software pipelined. The algorithm is
|
|
|
|
/// restricted to loops with a single basic block. Make sure that the
|
|
|
|
/// branch in the loop can be analyzed.
|
|
|
|
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
|
2020-05-05 22:27:59 +08:00
|
|
|
if (L.getNumBlocks() != 1) {
|
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "Not a single basic block: "
|
|
|
|
<< ore::NV("NumBlocks", L.getNumBlocks());
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
2020-05-05 22:27:59 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2020-05-05 22:27:59 +08:00
|
|
|
if (disabledByPragma) {
|
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "Disabled by Pragma.";
|
|
|
|
});
|
2019-01-23 11:26:10 +08:00
|
|
|
return false;
|
2020-05-05 22:27:59 +08:00
|
|
|
}
|
2019-01-23 11:26:10 +08:00
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
// Check if the branch can't be understood because we can't do pipelining
|
|
|
|
// if that's the case.
|
|
|
|
LI.TBB = nullptr;
|
|
|
|
LI.FBB = nullptr;
|
|
|
|
LI.BrCond.clear();
|
2019-05-31 23:35:19 +08:00
|
|
|
if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
|
2019-05-31 23:35:19 +08:00
|
|
|
NumFailBranch++;
|
2020-05-05 22:27:59 +08:00
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "The branch can't be understood";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
LI.LoopInductionVar = nullptr;
|
|
|
|
LI.LoopCompare = nullptr;
|
[MachinePipeliner] Improve the TargetInstrInfo API analyzeLoop/reduceLoopCount
Recommit: fix asan errors.
The way MachinePipeliner uses these target hooks is stateful - we reduce trip
count by one per call to reduceLoopCount. It's a little overfit for hardware
loops, where we don't have to worry about stitching a loop induction variable
across prologs and epilogs (the induction variable is implicit).
This patch introduces a new API:
/// Analyze loop L, which must be a single-basic-block loop, and if the
/// conditions can be understood enough produce a PipelinerLoopInfo object.
virtual std::unique_ptr<PipelinerLoopInfo>
analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const;
The return value is expected to be an implementation of the abstract class:
/// Object returned by analyzeLoopForPipelining. Allows software pipelining
/// implementations to query attributes of the loop being pipelined.
class PipelinerLoopInfo {
public:
virtual ~PipelinerLoopInfo();
/// Return true if the given instruction should not be pipelined and should
/// be ignored. An example could be a loop comparison, or induction variable
/// update with no users being pipelined.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const = 0;
/// Create a condition to determine if the trip count of the loop is greater
/// than TC.
///
/// If the trip count is statically known to be greater than TC, return
/// true. If the trip count is statically known to be not greater than TC,
/// return false. Otherwise return nullopt and fill out Cond with the test
/// condition.
virtual Optional<bool>
createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond) = 0;
/// Modify the loop such that the trip count is
/// OriginalTC + TripCountAdjust.
virtual void adjustTripCount(int TripCountAdjust) = 0;
/// Called when the loop's preheader has been modified to NewPreheader.
virtual void setPreheader(MachineBasicBlock *NewPreheader) = 0;
/// Called when the loop is being removed.
virtual void disposed() = 0;
};
The Pipeliner (ModuloSchedule.cpp) can use this object to modify the loop while
allowing the target to hold its own state across all calls. This API, in
particular the disjunction of creating a trip count check condition and
adjusting the loop, improves the code quality in ModuloSchedule.cpp.
llvm-svn: 372463
2019-09-21 16:19:41 +08:00
|
|
|
if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
|
2019-05-31 23:35:19 +08:00
|
|
|
NumFailLoop++;
|
2020-05-05 22:27:59 +08:00
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "The loop structure is not supported";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2019-05-31 23:35:19 +08:00
|
|
|
if (!L.getLoopPreheader()) {
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
|
2019-05-31 23:35:19 +08:00
|
|
|
NumFailPreheader++;
|
2020-05-05 22:27:59 +08:00
|
|
|
ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
|
|
|
|
L.getStartLoc(), L.getHeader())
|
|
|
|
<< "No loop preheader found";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2018-03-22 00:39:11 +08:00
|
|
|
// Remove any subregisters from inputs to phi nodes.
|
|
|
|
preprocessPhiNodes(*L.getHeader());
|
2016-07-30 00:44:44 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-03-22 00:39:11 +08:00
|
|
|
/// Rewrite the phi nodes at the top of block \p B so that none of their
/// incoming values uses a subregister.
///
/// For every incoming operand that has a subregister index, a COPY into a
/// fresh virtual register (of the phi result's register class) is inserted
/// before the first terminator of the corresponding predecessor block, and the
/// phi operand is redirected to that new register.
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  SlotIndexes &Indexes = *getAnalysis<LiveIntervals>().getSlotIndexes();

  for (MachineInstr &Phi : make_range(B.begin(), B.getFirstNonPHI())) {
    MachineOperand &Result = Phi.getOperand(0);
    assert(Result.getSubReg() == 0);
    auto *ResultRC = RegInfo.getRegClass(Result.getReg());

    // After the result, operands come in (value, predecessor block) pairs.
    const unsigned NumOps = Phi.getNumOperands();
    for (unsigned Idx = 1; Idx != NumOps; Idx += 2) {
      MachineOperand &Incoming = Phi.getOperand(Idx);
      if (Incoming.getSubReg() == 0)
        continue;

      // The operand uses a subregister: copy it into a new full register in
      // the predecessor and make the phi read that register instead.
      Register FullReg = RegInfo.createVirtualRegister(ResultRC);
      MachineBasicBlock &Pred = *Phi.getOperand(Idx + 1).getMBB();
      MachineBasicBlock::iterator InsertPt = Pred.getFirstTerminator();
      const DebugLoc &Loc = Pred.findDebugLoc(InsertPt);
      auto CopyMI =
          BuildMI(Pred, InsertPt, Loc, TII->get(TargetOpcode::COPY), FullReg)
              .addReg(Incoming.getReg(), getRegState(Incoming),
                      Incoming.getSubReg());
      Indexes.insertMachineInstrInMaps(*CopyMI);
      Incoming.setReg(FullReg);
      Incoming.setSubReg(0);
    }
  }
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// The SMS algorithm consists of the following main steps:
|
|
|
|
/// 1. Computation and analysis of the dependence graph.
|
|
|
|
/// 2. Ordering of the nodes (instructions).
|
|
|
|
/// 3. Attempt to Schedule the loop.
|
|
|
|
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
|
|
|
|
assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
|
|
|
|
II_setByPragma);
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
MachineBasicBlock *MBB = L.getHeader();
|
|
|
|
// The kernel should not include any terminator instructions. These
|
|
|
|
// will be added back later.
|
|
|
|
SMS.startBlock(MBB);
|
|
|
|
|
|
|
|
// Compute the number of 'real' instructions in the basic block by
|
|
|
|
// ignoring terminators.
|
|
|
|
unsigned size = MBB->size();
|
|
|
|
for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
|
|
|
|
E = MBB->instr_end();
|
|
|
|
I != E; ++I, --size)
|
|
|
|
;
|
|
|
|
|
|
|
|
SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
|
|
|
|
SMS.schedule();
|
|
|
|
SMS.exitRegion();
|
|
|
|
|
|
|
|
SMS.finishBlock();
|
|
|
|
return SMS.hasNewSchedule();
|
|
|
|
}
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
|
|
|
|
if (II_setByPragma > 0)
|
|
|
|
MII = II_setByPragma;
|
|
|
|
else
|
|
|
|
MII = std::max(ResMII, RecMII);
|
|
|
|
}
|
|
|
|
|
|
|
|
void SwingSchedulerDAG::setMAX_II() {
|
|
|
|
if (II_setByPragma > 0)
|
|
|
|
MAX_II = II_setByPragma;
|
|
|
|
else
|
|
|
|
MAX_II = MII + 10;
|
|
|
|
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// We override the schedule function in ScheduleDAGInstrs to implement the
|
|
|
|
/// scheduling part of the Swing Modulo Scheduling algorithm.
|
|
|
|
void SwingSchedulerDAG::schedule() {
|
|
|
|
AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
|
|
|
|
buildSchedGraph(AA);
|
|
|
|
addLoopCarriedDependences(AA);
|
|
|
|
updatePhiDependences();
|
|
|
|
Topo.InitDAGTopologicalSorting();
|
|
|
|
changeDependences();
|
2018-10-18 23:51:16 +08:00
|
|
|
postprocessDAG();
|
2018-09-19 08:23:35 +08:00
|
|
|
LLVM_DEBUG(dump());
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
NodeSetType NodeSets;
|
|
|
|
findCircuits(NodeSets);
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
NodeSetType Circuits = NodeSets;
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
// Calculate the MII.
|
|
|
|
unsigned ResMII = calculateResMII();
|
|
|
|
unsigned RecMII = calculateRecMII(NodeSets);
|
|
|
|
|
|
|
|
fuseRecs(NodeSets);
|
|
|
|
|
|
|
|
// This flag is used for testing and can cause correctness problems.
|
|
|
|
if (SwpIgnoreRecMII)
|
|
|
|
RecMII = 0;
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
setMII(ResMII, RecMII);
|
|
|
|
setMAX_II();
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
|
|
|
|
<< " (rec=" << RecMII << ", res=" << ResMII << ")\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
// Can't schedule a loop without a valid MII.
|
2019-05-31 23:35:19 +08:00
|
|
|
if (MII == 0) {
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
|
2019-05-31 23:35:19 +08:00
|
|
|
NumFailZeroMII++;
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "Invalid Minimal Initiation Interval: 0";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
// Don't pipeline large loops.
|
2019-05-31 23:35:19 +08:00
|
|
|
if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
|
|
|
|
LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
|
|
|
|
<< ", we don't pipleline large loops\n");
|
|
|
|
NumFailLargeMaxMII++;
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "Minimal Initiation Interval too large: "
|
|
|
|
<< ore::NV("MII", (int)MII) << " > "
|
|
|
|
<< ore::NV("SwpMaxMii", SwpMaxMii) << "."
|
|
|
|
<< "Refer to -pipeliner-max-mii.";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
computeNodeFunctions(NodeSets);
|
|
|
|
|
|
|
|
registerPressureFilter(NodeSets);
|
|
|
|
|
|
|
|
colocateNodeSets(NodeSets);
|
|
|
|
|
|
|
|
checkNodeSets(NodeSets);
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
for (auto &I : NodeSets) {
|
|
|
|
dbgs() << " Rec NodeSet ";
|
|
|
|
I.dump();
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
2019-04-23 22:51:27 +08:00
|
|
|
llvm::stable_sort(NodeSets, std::greater<NodeSet>());
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
groupRemainingNodes(NodeSets);
|
|
|
|
|
|
|
|
removeDuplicateNodes(NodeSets);
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
for (auto &I : NodeSets) {
|
|
|
|
dbgs() << " NodeSet ";
|
|
|
|
I.dump();
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
computeNodeOrder(NodeSets);
|
|
|
|
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// check for node order issues
|
|
|
|
checkValidNodeOrder(Circuits);
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
SMSchedule Schedule(Pass.MF);
|
|
|
|
Scheduled = schedulePipeline(Schedule);
|
|
|
|
|
2019-05-31 23:35:19 +08:00
|
|
|
if (!Scheduled){
|
|
|
|
LLVM_DEBUG(dbgs() << "No schedule found, return\n");
|
|
|
|
NumFailNoSchedule++;
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "Unable to find schedule";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
unsigned numStages = Schedule.getMaxStageCount();
|
|
|
|
// No need to generate pipeline if there are no overlapped iterations.
|
2019-05-31 23:35:19 +08:00
|
|
|
if (numStages == 0) {
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
|
2019-05-31 23:35:19 +08:00
|
|
|
NumFailZeroStage++;
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "No need to pipeline - no overlapped iterations in schedule.";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
// Check that the maximum stage count is less than user-defined limit.
|
2019-05-31 23:35:19 +08:00
|
|
|
if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
|
|
|
|
LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
|
|
|
|
<< " : too many stages, abort\n");
|
|
|
|
NumFailLargeMaxStage++;
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "Too many stages in schedule: "
|
|
|
|
<< ore::NV("numStages", (int)numStages) << " > "
|
|
|
|
<< ore::NV("SwpMaxStages", SwpMaxStages)
|
|
|
|
<< ". Refer to -pipeliner-max-stages.";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
|
|
|
|
Loop.getHeader())
|
|
|
|
<< "Pipelined succesfully!";
|
|
|
|
});
|
|
|
|
|
2019-08-31 02:49:50 +08:00
|
|
|
// Generate the schedule as a ModuloSchedule.
|
|
|
|
DenseMap<MachineInstr *, int> Cycles, Stages;
|
|
|
|
std::vector<MachineInstr *> OrderedInsts;
|
|
|
|
for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
|
|
|
|
++Cycle) {
|
|
|
|
for (SUnit *SU : Schedule.getInstructions(Cycle)) {
|
|
|
|
OrderedInsts.push_back(SU->getInstr());
|
|
|
|
Cycles[SU->getInstr()] = Cycle;
|
|
|
|
Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
|
|
|
|
for (auto &KV : NewMIs) {
|
|
|
|
Cycles[KV.first] = Cycles[KV.second];
|
|
|
|
Stages[KV.first] = Stages[KV.second];
|
|
|
|
NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
|
|
|
|
}
|
|
|
|
|
|
|
|
ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
|
|
|
|
std::move(Stages));
|
2019-09-03 16:20:31 +08:00
|
|
|
if (EmitTestAnnotations) {
|
|
|
|
assert(NewInstrChanges.empty() &&
|
|
|
|
"Cannot serialize a schedule with InstrChanges!");
|
|
|
|
ModuloScheduleTestAnnotater MSTI(MF, MS);
|
|
|
|
MSTI.annotate();
|
|
|
|
return;
|
|
|
|
}
|
2019-09-04 20:54:24 +08:00
|
|
|
// The experimental code generator can't work if there are InstChanges.
|
|
|
|
if (ExperimentalCodeGen && NewInstrChanges.empty()) {
|
|
|
|
PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
|
[ModuloSchedule] Peel out prologs and epilogs, generate actual code
Summary:
This extends the PeelingModuloScheduleExpander to generate prolog and epilog code,
and correctly stitch uses through the prolog, kernel, epilog DAG.
The key concept in this patch is to ensure that all transforms are *local*; only a
function of a block and its immediate predecessor and successor. By defining the problem in this way
we can inductively rewrite the entire DAG using only local knowledge that is easy to
reason about.
For example, we assume that all prologs and epilogs are near-perfect clones of the
steady-state kernel. This means that if a block has an instruction that is predicated out,
we can redirect all users of that instruction to that equivalent instruction in our
immediate predecessor. As all blocks are clones, every instruction must have an equivalent in
every other block.
Similarly we can make the assumption by construction that if a value defined in a block is used
outside that block, the only possible user is its immediate successors. We maintain this
even for values that are used outside the loop by creating a limited form of LCSSA.
This code isn't small, but it isn't complex.
Enabled a bunch of testing from Hexagon. There are a couple of tests not enabled yet;
I'm about 80% sure there isn't buggy codegen but the tests are checking for patterns
that we don't produce. Those still need a bit more investigation. In the meantime we
(Google) are happy with the code produced by this on our downstream SMS implementation,
and believe it generates correct code.
Subscribers: mgorny, hiraditya, jsji, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D68205
llvm-svn: 373462
2019-10-02 20:46:44 +08:00
|
|
|
MSE.expand();
|
2019-09-04 20:54:24 +08:00
|
|
|
} else {
|
|
|
|
ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
|
|
|
|
MSE.expand();
|
|
|
|
MSE.cleanup();
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
++NumPipelined;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Clean up after the software pipeliner runs.
|
|
|
|
void SwingSchedulerDAG::finishBlock() {
|
2019-08-31 02:49:50 +08:00
|
|
|
for (auto &KV : NewMIs)
|
|
|
|
MF.DeleteMachineInstr(KV.second);
|
2016-07-30 00:44:44 +08:00
|
|
|
NewMIs.clear();
|
|
|
|
|
|
|
|
// Call the superclass.
|
|
|
|
ScheduleDAGInstrs::finishBlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return the register values for the operands of a Phi instruction.
|
|
|
|
/// This function assume the instruction is a Phi.
|
|
|
|
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
|
|
|
|
unsigned &InitVal, unsigned &LoopVal) {
|
|
|
|
assert(Phi.isPHI() && "Expecting a Phi.");
|
|
|
|
|
|
|
|
InitVal = 0;
|
|
|
|
LoopVal = 0;
|
|
|
|
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
|
|
|
|
if (Phi.getOperand(i + 1).getMBB() != Loop)
|
|
|
|
InitVal = Phi.getOperand(i).getReg();
|
2017-03-17 03:52:00 +08:00
|
|
|
else
|
2016-07-30 00:44:44 +08:00
|
|
|
LoopVal = Phi.getOperand(i).getReg();
|
|
|
|
|
|
|
|
assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
|
|
|
|
}
|
|
|
|
|
2018-01-17 20:29:38 +08:00
|
|
|
/// Return the Phi register value that comes the loop block.
|
2016-07-30 00:44:44 +08:00
|
|
|
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
|
|
|
|
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
|
|
|
|
if (Phi.getOperand(i + 1).getMBB() == LoopBB)
|
|
|
|
return Phi.getOperand(i).getReg();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if SUb can be reached from SUa following the chain edges.
|
|
|
|
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
|
|
|
|
SmallPtrSet<SUnit *, 8> Visited;
|
|
|
|
SmallVector<SUnit *, 8> Worklist;
|
|
|
|
Worklist.push_back(SUa);
|
|
|
|
while (!Worklist.empty()) {
|
|
|
|
const SUnit *SU = Worklist.pop_back_val();
|
|
|
|
for (auto &SI : SU->Succs) {
|
|
|
|
SUnit *SuccSU = SI.getSUnit();
|
|
|
|
if (SI.getKind() == SDep::Order) {
|
|
|
|
if (Visited.count(SuccSU))
|
|
|
|
continue;
|
|
|
|
if (SuccSU == SUb)
|
|
|
|
return true;
|
|
|
|
Worklist.push_back(SuccSU);
|
|
|
|
Visited.insert(SuccSU);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if the instruction causes a chain between memory
|
|
|
|
/// references before and after it.
|
|
|
|
static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
|
2019-06-06 06:33:10 +08:00
|
|
|
return MI.isCall() || MI.mayRaiseFPException() ||
|
|
|
|
MI.hasUnmodeledSideEffects() ||
|
2016-07-30 00:44:44 +08:00
|
|
|
(MI.hasOrderedMemoryRef() &&
|
2016-09-10 09:03:20 +08:00
|
|
|
(!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Return the underlying objects for the memory references of an instruction.
|
|
|
|
/// This function calls the code in ValueTracking, but first checks that the
|
|
|
|
/// instruction has a memory operand.
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
static void getUnderlyingObjects(const MachineInstr *MI,
|
2020-07-31 17:09:54 +08:00
|
|
|
SmallVectorImpl<const Value *> &Objs) {
|
2016-07-30 00:44:44 +08:00
|
|
|
if (!MI->hasOneMemOperand())
|
|
|
|
return;
|
|
|
|
MachineMemOperand *MM = *MI->memoperands_begin();
|
|
|
|
if (!MM->getValue())
|
|
|
|
return;
|
2020-07-31 17:09:54 +08:00
|
|
|
getUnderlyingObjects(MM->getValue(), Objs);
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
for (const Value *V : Objs) {
|
2018-03-27 00:50:11 +08:00
|
|
|
if (!isIdentifiedObject(V)) {
|
|
|
|
Objs.clear();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
Objs.push_back(V);
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Add a chain edge between a load and store if the store can be an
|
|
|
|
/// alias of the load on a subsequent iteration, i.e., a loop carried
|
|
|
|
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
|
|
|
|
/// but that code doesn't create loop carried dependences.
|
|
|
|
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
|
2018-03-27 00:50:11 +08:00
|
|
|
Value *UnknownValue =
|
|
|
|
UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
|
2016-07-30 00:44:44 +08:00
|
|
|
for (auto &SU : SUnits) {
|
|
|
|
MachineInstr &MI = *SU.getInstr();
|
|
|
|
if (isDependenceBarrier(MI, AA))
|
|
|
|
PendingLoads.clear();
|
|
|
|
else if (MI.mayLoad()) {
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
SmallVector<const Value *, 4> Objs;
|
2020-07-31 17:09:54 +08:00
|
|
|
::getUnderlyingObjects(&MI, Objs);
|
2018-03-27 00:50:11 +08:00
|
|
|
if (Objs.empty())
|
|
|
|
Objs.push_back(UnknownValue);
|
2016-07-30 00:44:44 +08:00
|
|
|
for (auto V : Objs) {
|
|
|
|
SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
|
|
|
|
SUs.push_back(&SU);
|
|
|
|
}
|
|
|
|
} else if (MI.mayStore()) {
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
SmallVector<const Value *, 4> Objs;
|
2020-07-31 17:09:54 +08:00
|
|
|
::getUnderlyingObjects(&MI, Objs);
|
2018-03-27 00:50:11 +08:00
|
|
|
if (Objs.empty())
|
|
|
|
Objs.push_back(UnknownValue);
|
2016-07-30 00:44:44 +08:00
|
|
|
for (auto V : Objs) {
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 14:55:50 +08:00
|
|
|
MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
|
2016-07-30 00:44:44 +08:00
|
|
|
PendingLoads.find(V);
|
|
|
|
if (I == PendingLoads.end())
|
|
|
|
continue;
|
|
|
|
for (auto Load : I->second) {
|
|
|
|
if (isSuccOrder(Load, &SU))
|
|
|
|
continue;
|
|
|
|
MachineInstr &LdMI = *Load->getInstr();
|
|
|
|
// First, perform the cheaper check that compares the base register.
|
|
|
|
// If they are the same and the load offset is less than the store
|
|
|
|
// offset, then mark the dependence as loop carried potentially.
|
2019-04-19 17:08:38 +08:00
|
|
|
const MachineOperand *BaseOp1, *BaseOp2;
|
2016-07-30 00:44:44 +08:00
|
|
|
int64_t Offset1, Offset2;
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
bool Offset1IsScalable, Offset2IsScalable;
|
|
|
|
if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
|
|
|
|
Offset1IsScalable, TRI) &&
|
|
|
|
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
|
|
|
|
Offset2IsScalable, TRI)) {
|
2018-11-28 20:00:20 +08:00
|
|
|
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
Offset1IsScalable == Offset2IsScalable &&
|
2018-11-28 20:00:20 +08:00
|
|
|
(int)Offset1 < (int)Offset2) {
|
2019-09-27 06:53:44 +08:00
|
|
|
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
|
2018-03-27 00:50:11 +08:00
|
|
|
"What happened to the chain edge?");
|
|
|
|
SDep Dep(Load, SDep::Barrier);
|
|
|
|
Dep.setLatency(1);
|
|
|
|
SU.addPred(Dep);
|
|
|
|
continue;
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
// Second, the more expensive check that uses alias analysis on the
|
|
|
|
// base registers. If they alias, and the load offset is less than
|
|
|
|
// the store offset, the mark the dependence as loop carried.
|
|
|
|
if (!AA) {
|
2018-03-22 00:39:11 +08:00
|
|
|
SDep Dep(Load, SDep::Barrier);
|
|
|
|
Dep.setLatency(1);
|
|
|
|
SU.addPred(Dep);
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
|
|
|
|
MachineMemOperand *MMO2 = *MI.memoperands_begin();
|
|
|
|
if (!MMO1->getValue() || !MMO2->getValue()) {
|
2018-03-22 00:39:11 +08:00
|
|
|
SDep Dep(Load, SDep::Barrier);
|
|
|
|
Dep.setLatency(1);
|
|
|
|
SU.addPred(Dep);
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (MMO1->getValue() == MMO2->getValue() &&
|
|
|
|
MMO1->getOffset() <= MMO2->getOffset()) {
|
2018-03-22 00:39:11 +08:00
|
|
|
SDep Dep(Load, SDep::Barrier);
|
|
|
|
Dep.setLatency(1);
|
|
|
|
SU.addPred(Dep);
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
AliasResult AAResult = AA->alias(
|
2018-10-11 05:28:44 +08:00
|
|
|
MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
|
2016-07-30 00:44:44 +08:00
|
|
|
MMO1->getAAInfo()),
|
2018-10-11 05:28:44 +08:00
|
|
|
MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
|
2016-07-30 00:44:44 +08:00
|
|
|
MMO2->getAAInfo()));
|
|
|
|
|
2018-03-22 00:39:11 +08:00
|
|
|
if (AAResult != NoAlias) {
|
|
|
|
SDep Dep(Load, SDep::Barrier);
|
|
|
|
Dep.setLatency(1);
|
|
|
|
SU.addPred(Dep);
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
|
|
|
|
/// processes dependences for PHIs. This function adds true dependences
|
|
|
|
/// from a PHI to a use, and a loop carried dependence from the use to the
|
|
|
|
/// PHI. The loop carried dependence is represented as an anti dependence
|
|
|
|
/// edge. This function also removes chain dependences between unrelated
|
|
|
|
/// PHIs.
|
|
|
|
void SwingSchedulerDAG::updatePhiDependences() {
|
|
|
|
SmallVector<SDep, 4> RemoveDeps;
|
|
|
|
const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
|
|
|
|
|
|
|
|
// Iterate over each DAG node.
|
|
|
|
for (SUnit &I : SUnits) {
|
|
|
|
RemoveDeps.clear();
|
|
|
|
// Set to true if the instruction has an operand defined by a Phi.
|
|
|
|
unsigned HasPhiUse = 0;
|
|
|
|
unsigned HasPhiDef = 0;
|
|
|
|
MachineInstr *MI = I.getInstr();
|
|
|
|
// Iterate over each operand, and we process the definitions.
|
|
|
|
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
|
|
|
|
MOE = MI->operands_end();
|
|
|
|
MOI != MOE; ++MOI) {
|
|
|
|
if (!MOI->isReg())
|
|
|
|
continue;
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register Reg = MOI->getReg();
|
2016-07-30 00:44:44 +08:00
|
|
|
if (MOI->isDef()) {
|
|
|
|
// If the register is used by a Phi, then create an anti dependence.
|
|
|
|
for (MachineRegisterInfo::use_instr_iterator
|
|
|
|
UI = MRI.use_instr_begin(Reg),
|
|
|
|
UE = MRI.use_instr_end();
|
|
|
|
UI != UE; ++UI) {
|
|
|
|
MachineInstr *UseMI = &*UI;
|
|
|
|
SUnit *SU = getSUnit(UseMI);
|
2016-08-12 01:20:18 +08:00
|
|
|
if (SU != nullptr && UseMI->isPHI()) {
|
2016-07-30 00:44:44 +08:00
|
|
|
if (!MI->isPHI()) {
|
|
|
|
SDep Dep(SU, SDep::Anti, Reg);
|
2018-03-22 00:39:11 +08:00
|
|
|
Dep.setLatency(1);
|
2016-07-30 00:44:44 +08:00
|
|
|
I.addPred(Dep);
|
|
|
|
} else {
|
|
|
|
HasPhiDef = Reg;
|
|
|
|
// Add a chain edge to a dependent Phi that isn't an existing
|
|
|
|
// predecessor.
|
|
|
|
if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
|
|
|
|
I.addPred(SDep(SU, SDep::Barrier));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (MOI->isUse()) {
|
|
|
|
// If the register is defined by a Phi, then create a true dependence.
|
|
|
|
MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
|
2016-08-12 01:20:18 +08:00
|
|
|
if (DefMI == nullptr)
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
SUnit *SU = getSUnit(DefMI);
|
2016-08-12 01:20:18 +08:00
|
|
|
if (SU != nullptr && DefMI->isPHI()) {
|
2016-07-30 00:44:44 +08:00
|
|
|
if (!MI->isPHI()) {
|
|
|
|
SDep Dep(SU, SDep::Data, Reg);
|
|
|
|
Dep.setLatency(0);
|
2020-03-31 18:57:51 +08:00
|
|
|
ST.adjustSchedDependency(SU, 0, &I, MI->getOperandNo(MOI), Dep);
|
2016-07-30 00:44:44 +08:00
|
|
|
I.addPred(Dep);
|
|
|
|
} else {
|
|
|
|
HasPhiUse = Reg;
|
|
|
|
// Add a chain edge to a dependent Phi that isn't an existing
|
|
|
|
// predecessor.
|
|
|
|
if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
|
|
|
|
I.addPred(SDep(SU, SDep::Barrier));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Remove order dependences from an unrelated Phi.
|
|
|
|
if (!SwpPruneDeps)
|
|
|
|
continue;
|
|
|
|
for (auto &PI : I.Preds) {
|
|
|
|
MachineInstr *PMI = PI.getSUnit()->getInstr();
|
|
|
|
if (PMI->isPHI() && PI.getKind() == SDep::Order) {
|
|
|
|
if (I.getInstr()->isPHI()) {
|
|
|
|
if (PMI->getOperand(0).getReg() == HasPhiUse)
|
|
|
|
continue;
|
|
|
|
if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
RemoveDeps.push_back(PI);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
|
|
|
|
I.removePred(RemoveDeps[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Iterate over each DAG node and see if we can change any dependences
|
|
|
|
/// in order to reduce the recurrence MII.
|
|
|
|
void SwingSchedulerDAG::changeDependences() {
|
|
|
|
// See if an instruction can use a value from the previous iteration.
|
|
|
|
// If so, we update the base and offset of the instruction and change
|
|
|
|
// the dependences.
|
|
|
|
for (SUnit &I : SUnits) {
|
|
|
|
unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
|
|
|
|
int64_t NewOffset = 0;
|
|
|
|
if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
|
|
|
|
NewOffset))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Get the MI and SUnit for the instruction that defines the original base.
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
|
|
|
|
if (!DefMI)
|
|
|
|
continue;
|
|
|
|
SUnit *DefSU = getSUnit(DefMI);
|
|
|
|
if (!DefSU)
|
|
|
|
continue;
|
|
|
|
// Get the MI and SUnit for the instruction that defins the new base.
|
|
|
|
MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
|
|
|
|
if (!LastMI)
|
|
|
|
continue;
|
|
|
|
SUnit *LastSU = getSUnit(LastMI);
|
|
|
|
if (!LastSU)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (Topo.IsReachable(&I, LastSU))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Remove the dependence. The value now depends on a prior iteration.
|
|
|
|
SmallVector<SDep, 4> Deps;
|
|
|
|
for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
|
|
|
|
++P)
|
|
|
|
if (P->getSUnit() == DefSU)
|
|
|
|
Deps.push_back(*P);
|
|
|
|
for (int i = 0, e = Deps.size(); i != e; i++) {
|
|
|
|
Topo.RemovePred(&I, Deps[i].getSUnit());
|
|
|
|
I.removePred(Deps[i]);
|
|
|
|
}
|
|
|
|
// Remove the chain dependence between the instructions.
|
|
|
|
Deps.clear();
|
|
|
|
for (auto &P : LastSU->Preds)
|
|
|
|
if (P.getSUnit() == &I && P.getKind() == SDep::Order)
|
|
|
|
Deps.push_back(P);
|
|
|
|
for (int i = 0, e = Deps.size(); i != e; i++) {
|
|
|
|
Topo.RemovePred(LastSU, Deps[i].getSUnit());
|
|
|
|
LastSU->removePred(Deps[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add a dependence between the new instruction and the instruction
|
|
|
|
// that defines the new base.
|
|
|
|
SDep Dep(&I, SDep::Anti, NewBase);
|
2018-10-12 03:42:46 +08:00
|
|
|
Topo.AddPred(LastSU, &I);
|
2016-07-30 00:44:44 +08:00
|
|
|
LastSU->addPred(Dep);
|
|
|
|
|
|
|
|
// Remember the base and offset information so that we can update the
|
|
|
|
// instruction during code generation.
|
|
|
|
InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
2016-08-12 01:20:18 +08:00
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
// FuncUnitSorter - Comparison operator used to sort instructions by
|
|
|
|
// the number of functional unit choices.
|
|
|
|
struct FuncUnitSorter {
|
|
|
|
const InstrItineraryData *InstrItins;
|
2019-05-29 11:02:59 +08:00
|
|
|
const MCSubtargetInfo *STI;
|
2019-12-09 23:22:57 +08:00
|
|
|
DenseMap<InstrStage::FuncUnits, unsigned> Resources;
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2019-05-29 11:02:59 +08:00
|
|
|
FuncUnitSorter(const TargetSubtargetInfo &TSI)
|
|
|
|
: InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
|
2017-09-12 07:00:48 +08:00
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
// Compute the number of functional unit alternatives needed
|
|
|
|
// at each stage, and take the minimum value. We prioritize the
|
|
|
|
// instructions by the least number of choices first.
|
2019-12-09 23:22:57 +08:00
|
|
|
unsigned minFuncUnits(const MachineInstr *Inst,
|
|
|
|
InstrStage::FuncUnits &F) const {
|
2019-05-29 11:02:59 +08:00
|
|
|
unsigned SchedClass = Inst->getDesc().getSchedClass();
|
2016-07-30 00:44:44 +08:00
|
|
|
unsigned min = UINT_MAX;
|
2019-05-29 11:02:59 +08:00
|
|
|
if (InstrItins && !InstrItins->isEmpty()) {
|
|
|
|
for (const InstrStage &IS :
|
|
|
|
make_range(InstrItins->beginStage(SchedClass),
|
|
|
|
InstrItins->endStage(SchedClass))) {
|
2019-12-09 23:22:57 +08:00
|
|
|
InstrStage::FuncUnits funcUnits = IS.getUnits();
|
2019-05-29 11:02:59 +08:00
|
|
|
unsigned numAlternatives = countPopulation(funcUnits);
|
|
|
|
if (numAlternatives < min) {
|
|
|
|
min = numAlternatives;
|
|
|
|
F = funcUnits;
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
2019-05-29 11:02:59 +08:00
|
|
|
return min;
|
|
|
|
}
|
|
|
|
if (STI && STI->getSchedModel().hasInstrSchedModel()) {
|
|
|
|
const MCSchedClassDesc *SCDesc =
|
|
|
|
STI->getSchedModel().getSchedClassDesc(SchedClass);
|
|
|
|
if (!SCDesc->isValid())
|
|
|
|
// No valid Schedule Class Desc for schedClass, should be
|
|
|
|
// Pseudo/PostRAPseudo
|
|
|
|
return min;
|
|
|
|
|
|
|
|
for (const MCWriteProcResEntry &PRE :
|
|
|
|
make_range(STI->getWriteProcResBegin(SCDesc),
|
|
|
|
STI->getWriteProcResEnd(SCDesc))) {
|
|
|
|
if (!PRE.Cycles)
|
|
|
|
continue;
|
|
|
|
const MCProcResourceDesc *ProcResource =
|
|
|
|
STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
|
|
|
|
unsigned NumUnits = ProcResource->NumUnits;
|
|
|
|
if (NumUnits < min) {
|
|
|
|
min = NumUnits;
|
|
|
|
F = PRE.ProcResourceIdx;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return min;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
2019-05-29 11:02:59 +08:00
|
|
|
llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Compute the critical resources needed by the instruction. This
|
|
|
|
// function records the functional units needed by instructions that
|
|
|
|
// must use only one functional unit. We use this as a tie breaker
|
|
|
|
// for computing the resource MII. The instrutions that require
|
|
|
|
// the same, highly used, functional unit have high priority.
|
|
|
|
void calcCriticalResources(MachineInstr &MI) {
|
|
|
|
unsigned SchedClass = MI.getDesc().getSchedClass();
|
2019-05-29 11:02:59 +08:00
|
|
|
if (InstrItins && !InstrItins->isEmpty()) {
|
|
|
|
for (const InstrStage &IS :
|
|
|
|
make_range(InstrItins->beginStage(SchedClass),
|
|
|
|
InstrItins->endStage(SchedClass))) {
|
2019-12-09 23:22:57 +08:00
|
|
|
InstrStage::FuncUnits FuncUnits = IS.getUnits();
|
2019-05-29 11:02:59 +08:00
|
|
|
if (countPopulation(FuncUnits) == 1)
|
|
|
|
Resources[FuncUnits]++;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (STI && STI->getSchedModel().hasInstrSchedModel()) {
|
|
|
|
const MCSchedClassDesc *SCDesc =
|
|
|
|
STI->getSchedModel().getSchedClassDesc(SchedClass);
|
|
|
|
if (!SCDesc->isValid())
|
|
|
|
// No valid Schedule Class Desc for schedClass, should be
|
|
|
|
// Pseudo/PostRAPseudo
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (const MCWriteProcResEntry &PRE :
|
|
|
|
make_range(STI->getWriteProcResBegin(SCDesc),
|
|
|
|
STI->getWriteProcResEnd(SCDesc))) {
|
|
|
|
if (!PRE.Cycles)
|
|
|
|
continue;
|
|
|
|
Resources[PRE.ProcResourceIdx]++;
|
|
|
|
}
|
|
|
|
return;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
2019-05-29 11:02:59 +08:00
|
|
|
llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if IS1 has less priority than IS2.
|
|
|
|
bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
|
2019-12-09 23:22:57 +08:00
|
|
|
InstrStage::FuncUnits F1 = 0, F2 = 0;
|
2016-07-30 00:44:44 +08:00
|
|
|
unsigned MFUs1 = minFuncUnits(IS1, F1);
|
|
|
|
unsigned MFUs2 = minFuncUnits(IS2, F2);
|
[MachinePipeliner] Avoid indeterminate order in FuncUnitSorter
Summary:
This is exposed by adding a new testcase in PowerPC in
https://reviews.llvm.org/rL367732
The testcase got different output on different platform, hence breaking
buildbots.
The problem is that we get differnt FuncUnitOrder when calculateResMII.
The root cause is:
1. Two MachineInstr might get SAME priority(MFUsx) from minFuncUnits.
2. Current comparison operator() will return `MFUs1 > MFUs2`.
3. We use iterators for MachineInstr, so the input to FuncUnitSorter
might be different on differnt platform due to the iterator nature.
So for two MI with same MFU, their order is actually depends on the
iterator order, which is platform (implemtation) dependent.
This is risky, and may cause cross-compiling problems.
The fix is to check make sure we assign a determine order when they are
equal.
Reviewers: bcahoon, hfinkel, jmolloy
Subscribers: nemanjai, hiraditya, MaskRay, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65992
llvm-svn: 368441
2019-08-09 22:10:57 +08:00
|
|
|
if (MFUs1 == MFUs2)
|
2016-07-30 00:44:44 +08:00
|
|
|
return Resources.lookup(F1) < Resources.lookup(F2);
|
|
|
|
return MFUs1 > MFUs2;
|
|
|
|
}
|
|
|
|
};
|
2016-08-12 01:20:18 +08:00
|
|
|
|
|
|
|
} // end anonymous namespace
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
/// Calculate the resource constrained minimum initiation interval for the
|
|
|
|
/// specified loop. We use the DFA to model the resources needed for
|
|
|
|
/// each instruction, and we ignore dependences. A different DFA is created
|
|
|
|
/// for each cycle that is required. When adding a new instruction, we attempt
|
|
|
|
/// to add it to each existing DFA, until a legal space is found. If the
|
|
|
|
/// instruction cannot be reserved in an existing DFA, we create a new one.
|
|
|
|
unsigned SwingSchedulerDAG::calculateResMII() {
|
2019-05-29 11:02:59 +08:00
|
|
|
|
2019-05-31 23:35:19 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "calculateResMII:\n");
|
2019-05-29 11:02:59 +08:00
|
|
|
SmallVector<ResourceManager*, 8> Resources;
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineBasicBlock *MBB = Loop.getHeader();
|
2019-05-29 11:02:59 +08:00
|
|
|
Resources.push_back(new ResourceManager(&MF.getSubtarget()));
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
// Sort the instructions by the number of available choices for scheduling,
|
|
|
|
// least to most. Use the number of critical resources as the tie breaker.
|
2019-05-29 11:02:59 +08:00
|
|
|
FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());
|
2016-07-30 00:44:44 +08:00
|
|
|
for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
|
|
|
|
E = MBB->getFirstTerminator();
|
|
|
|
I != E; ++I)
|
|
|
|
FUS.calcCriticalResources(*I);
|
|
|
|
PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
|
|
|
|
FuncUnitOrder(FUS);
|
|
|
|
|
|
|
|
for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
|
|
|
|
E = MBB->getFirstTerminator();
|
|
|
|
I != E; ++I)
|
|
|
|
FuncUnitOrder.push(&*I);
|
|
|
|
|
|
|
|
while (!FuncUnitOrder.empty()) {
|
|
|
|
MachineInstr *MI = FuncUnitOrder.top();
|
|
|
|
FuncUnitOrder.pop();
|
|
|
|
if (TII->isZeroCost(MI->getOpcode()))
|
|
|
|
continue;
|
|
|
|
// Attempt to reserve the instruction in an existing DFA. At least one
|
|
|
|
// DFA is needed for each cycle.
|
|
|
|
unsigned NumCycles = getSUnit(MI)->Latency;
|
|
|
|
unsigned ReservedCycles = 0;
|
2019-05-29 11:02:59 +08:00
|
|
|
SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();
|
|
|
|
SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();
|
2019-05-31 23:35:19 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << "Trying to reserve resource for " << NumCycles
|
|
|
|
<< " cycles for \n";
|
|
|
|
MI->dump();
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
for (unsigned C = 0; C < NumCycles; ++C)
|
|
|
|
while (RI != RE) {
|
2019-06-26 05:50:56 +08:00
|
|
|
if ((*RI)->canReserveResources(*MI)) {
|
|
|
|
(*RI)->reserveResources(*MI);
|
2016-07-30 00:44:44 +08:00
|
|
|
++ReservedCycles;
|
|
|
|
break;
|
|
|
|
}
|
2019-06-26 05:50:56 +08:00
|
|
|
RI++;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
2019-05-31 23:35:19 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
|
|
|
|
<< ", NumCycles:" << NumCycles << "\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
// Add new DFAs, if needed, to reserve resources.
|
|
|
|
for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
|
2019-06-19 04:24:49 +08:00
|
|
|
LLVM_DEBUG(if (SwpDebugResource) dbgs()
|
|
|
|
<< "NewResource created to reserve resources"
|
|
|
|
<< "\n");
|
2019-05-29 11:02:59 +08:00
|
|
|
ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());
|
2016-07-30 00:44:44 +08:00
|
|
|
assert(NewResource->canReserveResources(*MI) && "Reserve error.");
|
|
|
|
NewResource->reserveResources(*MI);
|
|
|
|
Resources.push_back(NewResource);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int Resmii = Resources.size();
|
2020-05-05 22:27:59 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
// Delete the memory for each of the DFAs that were created earlier.
|
2019-05-29 11:02:59 +08:00
|
|
|
for (ResourceManager *RI : Resources) {
|
|
|
|
ResourceManager *D = RI;
|
2016-07-30 00:44:44 +08:00
|
|
|
delete D;
|
|
|
|
}
|
|
|
|
Resources.clear();
|
|
|
|
return Resmii;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Calculate the recurrence-constrainted minimum initiation interval.
|
|
|
|
/// Iterate over each circuit. Compute the delay(c) and distance(c)
|
|
|
|
/// for each circuit. The II needs to satisfy the inequality
|
|
|
|
/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
|
2018-06-20 13:29:26 +08:00
|
|
|
/// II that satisfies the inequality, and the RecMII is the maximum
|
2016-07-30 00:44:44 +08:00
|
|
|
/// of those values.
|
|
|
|
unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
|
|
|
|
unsigned RecMII = 0;
|
|
|
|
|
|
|
|
for (NodeSet &Nodes : NodeSets) {
|
2017-09-12 07:00:48 +08:00
|
|
|
if (Nodes.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
|
2018-03-27 00:33:16 +08:00
|
|
|
unsigned Delay = Nodes.getLatency();
|
2016-07-30 00:44:44 +08:00
|
|
|
unsigned Distance = 1;
|
|
|
|
|
|
|
|
// ii = ceil(delay / distance)
|
|
|
|
unsigned CurMII = (Delay + Distance - 1) / Distance;
|
|
|
|
Nodes.setRecMII(CurMII);
|
|
|
|
if (CurMII > RecMII)
|
|
|
|
RecMII = CurMII;
|
|
|
|
}
|
|
|
|
|
|
|
|
return RecMII;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
|
|
|
|
/// but we do this to find the circuits, and then change them back.
|
|
|
|
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
|
|
|
|
SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
|
|
|
|
for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
|
|
|
|
SUnit *SU = &SUnits[i];
|
|
|
|
for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
|
|
|
|
IP != EP; ++IP) {
|
|
|
|
if (IP->getKind() != SDep::Anti)
|
|
|
|
continue;
|
|
|
|
DepsAdded.push_back(std::make_pair(SU, *IP));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
|
|
|
|
E = DepsAdded.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
// Remove this anti dependency and add one in the reverse direction.
|
|
|
|
SUnit *SU = I->first;
|
|
|
|
SDep &D = I->second;
|
|
|
|
SUnit *TargetSU = D.getSUnit();
|
|
|
|
unsigned Reg = D.getReg();
|
|
|
|
unsigned Lat = D.getLatency();
|
|
|
|
SU->removePred(D);
|
|
|
|
SDep Dep(SU, SDep::Anti, Reg);
|
|
|
|
Dep.setLatency(Lat);
|
|
|
|
TargetSU->addPred(Dep);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Create the adjacency structure of the nodes in the graph.
|
|
|
|
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
|
|
|
|
SwingSchedulerDAG *DAG) {
|
|
|
|
BitVector Added(SUnits.size());
|
2018-03-27 00:05:55 +08:00
|
|
|
DenseMap<int, int> OutputDeps;
|
2016-07-30 00:44:44 +08:00
|
|
|
for (int i = 0, e = SUnits.size(); i != e; ++i) {
|
|
|
|
Added.reset();
|
|
|
|
// Add any successor to the adjacency matrix and exclude duplicates.
|
|
|
|
for (auto &SI : SUnits[i].Succs) {
|
2018-03-27 00:05:55 +08:00
|
|
|
// Only create a back-edge on the first and last nodes of a dependence
|
|
|
|
// chain. This records any chains and adds them later.
|
|
|
|
if (SI.getKind() == SDep::Output) {
|
|
|
|
int N = SI.getSUnit()->NodeNum;
|
|
|
|
int BackEdge = i;
|
|
|
|
auto Dep = OutputDeps.find(BackEdge);
|
|
|
|
if (Dep != OutputDeps.end()) {
|
|
|
|
BackEdge = Dep->second;
|
|
|
|
OutputDeps.erase(Dep);
|
|
|
|
}
|
|
|
|
OutputDeps[N] = BackEdge;
|
|
|
|
}
|
2018-10-26 05:27:08 +08:00
|
|
|
// Do not process a boundary node, an artificial node.
|
|
|
|
// A back-edge is processed only if it goes to a Phi.
|
|
|
|
if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
|
2016-07-30 00:44:44 +08:00
|
|
|
(SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
|
|
|
|
continue;
|
|
|
|
int N = SI.getSUnit()->NodeNum;
|
|
|
|
if (!Added.test(N)) {
|
|
|
|
AdjK[i].push_back(N);
|
|
|
|
Added.set(N);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// A chain edge between a store and a load is treated as a back-edge in the
|
|
|
|
// adjacency matrix.
|
|
|
|
for (auto &PI : SUnits[i].Preds) {
|
|
|
|
if (!SUnits[i].getInstr()->mayStore() ||
|
2018-03-27 00:05:55 +08:00
|
|
|
!DAG->isLoopCarriedDep(&SUnits[i], PI, false))
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
|
|
|
if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
|
|
|
|
int N = PI.getSUnit()->NodeNum;
|
|
|
|
if (!Added.test(N)) {
|
|
|
|
AdjK[i].push_back(N);
|
|
|
|
Added.set(N);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-01-09 13:11:10 +08:00
|
|
|
// Add back-edges in the adjacency matrix for the output dependences.
|
2018-03-27 00:05:55 +08:00
|
|
|
for (auto &OD : OutputDeps)
|
|
|
|
if (!Added.test(OD.second)) {
|
|
|
|
AdjK[OD.first].push_back(OD.second);
|
|
|
|
Added.set(OD.second);
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Identify an elementary circuit in the dependence graph starting at the
|
|
|
|
/// specified node.
|
|
|
|
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
|
|
|
|
bool HasBackedge) {
|
|
|
|
SUnit *SV = &SUnits[V];
|
|
|
|
bool F = false;
|
|
|
|
Stack.insert(SV);
|
|
|
|
Blocked.set(V);
|
|
|
|
|
|
|
|
for (auto W : AdjK[V]) {
|
|
|
|
if (NumPaths > MaxPaths)
|
|
|
|
break;
|
|
|
|
if (W < S)
|
|
|
|
continue;
|
|
|
|
if (W == S) {
|
|
|
|
if (!HasBackedge)
|
|
|
|
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
|
|
|
|
F = true;
|
|
|
|
++NumPaths;
|
|
|
|
break;
|
|
|
|
} else if (!Blocked.test(W)) {
|
2018-10-12 03:45:07 +08:00
|
|
|
if (circuit(W, S, NodeSets,
|
|
|
|
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
|
2016-07-30 00:44:44 +08:00
|
|
|
F = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (F)
|
|
|
|
unblock(V);
|
|
|
|
else {
|
|
|
|
for (auto W : AdjK[V]) {
|
|
|
|
if (W < S)
|
|
|
|
continue;
|
|
|
|
if (B[W].count(SV) == 0)
|
|
|
|
B[W].insert(SV);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Stack.pop_back();
|
|
|
|
return F;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Unblock a node in the circuit finding algorithm.
|
|
|
|
void SwingSchedulerDAG::Circuits::unblock(int U) {
|
|
|
|
Blocked.reset(U);
|
|
|
|
SmallPtrSet<SUnit *, 4> &BU = B[U];
|
|
|
|
while (!BU.empty()) {
|
|
|
|
SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
|
|
|
|
assert(SI != BU.end() && "Invalid B set.");
|
|
|
|
SUnit *W = *SI;
|
|
|
|
BU.erase(W);
|
|
|
|
if (Blocked.test(W->NodeNum))
|
|
|
|
unblock(W->NodeNum);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Identify all the elementary circuits in the dependence graph using
|
|
|
|
/// Johnson's circuit algorithm.
|
|
|
|
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
|
|
|
|
// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
|
|
|
|
// but we do this to find the circuits, and then change them back.
|
|
|
|
swapAntiDependences(SUnits);
|
|
|
|
|
2018-10-12 03:45:07 +08:00
|
|
|
Circuits Cir(SUnits, Topo);
|
2016-07-30 00:44:44 +08:00
|
|
|
// Create the adjacency structure.
|
|
|
|
Cir.createAdjacencyStructure(this);
|
|
|
|
for (int i = 0, e = SUnits.size(); i != e; ++i) {
|
|
|
|
Cir.reset();
|
|
|
|
Cir.circuit(i, i, NodeSets);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Change the dependences back so that we've created a DAG again.
|
|
|
|
swapAntiDependences(SUnits);
|
|
|
|
}
|
|
|
|
|
2018-10-18 23:51:16 +08:00
|
|
|
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
|
|
|
|
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
|
|
|
|
// additional copies that are needed across iterations. An artificial dependence
|
|
|
|
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
|
|
|
|
|
|
|
|
// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
|
|
|
|
// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
|
|
|
|
// PHI-------True-Dep------> USEOfPhi
|
|
|
|
|
|
|
|
// The mutation creates
|
|
|
|
// USEOfPHI -------Artificial-Dep---> SRCOfCopy
|
|
|
|
|
|
|
|
// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
|
|
|
|
// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
|
|
|
|
// late to avoid additional copies across iterations. The possible scheduling
|
|
|
|
// order would be
|
|
|
|
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
|
|
|
|
|
|
|
|
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
|
|
|
|
for (SUnit &SU : DAG->SUnits) {
|
|
|
|
// Find the COPY/REG_SEQUENCE instruction.
|
|
|
|
if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Record the loop carried PHIs.
|
|
|
|
SmallVector<SUnit *, 4> PHISUs;
|
|
|
|
// Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
|
|
|
|
SmallVector<SUnit *, 4> SrcSUs;
|
|
|
|
|
|
|
|
for (auto &Dep : SU.Preds) {
|
|
|
|
SUnit *TmpSU = Dep.getSUnit();
|
|
|
|
MachineInstr *TmpMI = TmpSU->getInstr();
|
|
|
|
SDep::Kind DepKind = Dep.getKind();
|
|
|
|
// Save the loop carried PHI.
|
|
|
|
if (DepKind == SDep::Anti && TmpMI->isPHI())
|
|
|
|
PHISUs.push_back(TmpSU);
|
|
|
|
// Save the source of COPY/REG_SEQUENCE.
|
|
|
|
// If the source has no pre-decessors, we will end up creating cycles.
|
|
|
|
else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
|
|
|
|
SrcSUs.push_back(TmpSU);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PHISUs.size() == 0 || SrcSUs.size() == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
|
|
|
|
// SUnit to the container.
|
|
|
|
SmallVector<SUnit *, 8> UseSUs;
|
2019-11-15 03:08:06 +08:00
|
|
|
// Do not use iterator based loop here as we are updating the container.
|
|
|
|
for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
|
|
|
|
for (auto &Dep : PHISUs[Index]->Succs) {
|
2018-10-18 23:51:16 +08:00
|
|
|
if (Dep.getKind() != SDep::Data)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SUnit *TmpSU = Dep.getSUnit();
|
|
|
|
MachineInstr *TmpMI = TmpSU->getInstr();
|
|
|
|
if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
|
|
|
|
PHISUs.push_back(TmpSU);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
UseSUs.push_back(TmpSU);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UseSUs.size() == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
|
|
|
|
// Add the artificial dependencies if it does not form a cycle.
|
|
|
|
for (auto I : UseSUs) {
|
|
|
|
for (auto Src : SrcSUs) {
|
|
|
|
if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
|
|
|
|
Src->addPred(SDep(I, SDep::Artificial));
|
|
|
|
SDAG->Topo.AddPred(Src, I);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// Return true for DAG nodes that we ignore when computing the cost functions.
|
2018-06-20 13:29:26 +08:00
|
|
|
/// We ignore the back-edge recurrence in order to avoid unbounded recursion
|
2016-07-30 00:44:44 +08:00
|
|
|
/// in the calculation of the ASAP, ALAP, etc functions.
|
|
|
|
static bool ignoreDependence(const SDep &D, bool isPred) {
|
|
|
|
if (D.isArtificial())
|
|
|
|
return true;
|
|
|
|
return D.getKind() == SDep::Anti && isPred;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute several functions need to order the nodes for scheduling.
|
|
|
|
/// ASAP - Earliest time to schedule a node.
|
|
|
|
/// ALAP - Latest time to schedule a node.
|
|
|
|
/// MOV - Mobility function, difference between ALAP and ASAP.
|
|
|
|
/// D - Depth of each node.
|
|
|
|
/// H - Height of each node.
|
|
|
|
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
|
|
|
|
ScheduleInfo.resize(SUnits.size());
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
|
|
|
|
E = Topo.end();
|
|
|
|
I != E; ++I) {
|
2018-09-19 08:23:35 +08:00
|
|
|
const SUnit &SU = SUnits[*I];
|
|
|
|
dumpNode(SU);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
int maxASAP = 0;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// Compute ASAP and ZeroLatencyDepth.
|
2016-07-30 00:44:44 +08:00
|
|
|
for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
|
|
|
|
E = Topo.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
int asap = 0;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
int zeroLatencyDepth = 0;
|
2016-07-30 00:44:44 +08:00
|
|
|
SUnit *SU = &SUnits[*I];
|
|
|
|
for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
|
|
|
|
EP = SU->Preds.end();
|
|
|
|
IP != EP; ++IP) {
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
SUnit *pred = IP->getSUnit();
|
2018-03-22 00:39:11 +08:00
|
|
|
if (IP->getLatency() == 0)
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
zeroLatencyDepth =
|
|
|
|
std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
|
2016-07-30 00:44:44 +08:00
|
|
|
if (ignoreDependence(*IP, true))
|
|
|
|
continue;
|
2018-03-22 00:39:11 +08:00
|
|
|
asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
|
2016-07-30 00:44:44 +08:00
|
|
|
getDistance(pred, SU, *IP) * MII));
|
|
|
|
}
|
|
|
|
maxASAP = std::max(maxASAP, asap);
|
|
|
|
ScheduleInfo[*I].ASAP = asap;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// Compute ALAP, ZeroLatencyHeight, and MOV.
|
2016-07-30 00:44:44 +08:00
|
|
|
for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
|
|
|
|
E = Topo.rend();
|
|
|
|
I != E; ++I) {
|
|
|
|
int alap = maxASAP;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
int zeroLatencyHeight = 0;
|
2016-07-30 00:44:44 +08:00
|
|
|
SUnit *SU = &SUnits[*I];
|
|
|
|
for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
|
|
|
|
ES = SU->Succs.end();
|
|
|
|
IS != ES; ++IS) {
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
SUnit *succ = IS->getSUnit();
|
2018-03-22 00:39:11 +08:00
|
|
|
if (IS->getLatency() == 0)
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
zeroLatencyHeight =
|
|
|
|
std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
|
2016-07-30 00:44:44 +08:00
|
|
|
if (ignoreDependence(*IS, true))
|
|
|
|
continue;
|
2018-03-22 00:39:11 +08:00
|
|
|
alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
|
2016-07-30 00:44:44 +08:00
|
|
|
getDistance(SU, succ, *IS) * MII));
|
|
|
|
}
|
|
|
|
|
|
|
|
ScheduleInfo[*I].ALAP = alap;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// After computing the node functions, compute the summary for each node set.
|
|
|
|
for (NodeSet &I : NodeSets)
|
|
|
|
I.computeNodeSetInfo(this);
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
for (unsigned i = 0; i < SUnits.size(); i++) {
|
|
|
|
dbgs() << "\tNode " << i << ":\n";
|
|
|
|
dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
|
|
|
|
dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
|
|
|
|
dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
|
|
|
|
dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
|
|
|
|
dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
|
|
|
|
dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
|
|
|
|
/// as the predecessors of the elements of NodeOrder that are not also in
|
|
|
|
/// NodeOrder.
|
|
|
|
static bool pred_L(SetVector<SUnit *> &NodeOrder,
|
|
|
|
SmallSetVector<SUnit *, 8> &Preds,
|
|
|
|
const NodeSet *S = nullptr) {
|
|
|
|
Preds.clear();
|
|
|
|
for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
|
|
|
|
PI != PE; ++PI) {
|
|
|
|
if (S && S->count(PI->getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (ignoreDependence(*PI, true))
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(PI->getSUnit()) == 0)
|
|
|
|
Preds.insert(PI->getSUnit());
|
|
|
|
}
|
|
|
|
// Back-edges are predecessors with an anti-dependence.
|
|
|
|
for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
|
|
|
|
ES = (*I)->Succs.end();
|
|
|
|
IS != ES; ++IS) {
|
|
|
|
if (IS->getKind() != SDep::Anti)
|
|
|
|
continue;
|
|
|
|
if (S && S->count(IS->getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(IS->getSUnit()) == 0)
|
|
|
|
Preds.insert(IS->getSUnit());
|
|
|
|
}
|
|
|
|
}
|
2017-09-12 07:00:48 +08:00
|
|
|
return !Preds.empty();
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
|
|
|
|
/// as the successors of the elements of NodeOrder that are not also in
|
|
|
|
/// NodeOrder.
|
|
|
|
static bool succ_L(SetVector<SUnit *> &NodeOrder,
|
|
|
|
SmallSetVector<SUnit *, 8> &Succs,
|
|
|
|
const NodeSet *S = nullptr) {
|
|
|
|
Succs.clear();
|
|
|
|
for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
|
|
|
|
SI != SE; ++SI) {
|
|
|
|
if (S && S->count(SI->getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (ignoreDependence(*SI, false))
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(SI->getSUnit()) == 0)
|
|
|
|
Succs.insert(SI->getSUnit());
|
|
|
|
}
|
|
|
|
for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
|
|
|
|
PE = (*I)->Preds.end();
|
|
|
|
PI != PE; ++PI) {
|
|
|
|
if (PI->getKind() != SDep::Anti)
|
|
|
|
continue;
|
|
|
|
if (S && S->count(PI->getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(PI->getSUnit()) == 0)
|
|
|
|
Succs.insert(PI->getSUnit());
|
|
|
|
}
|
|
|
|
}
|
2017-09-12 07:00:48 +08:00
|
|
|
return !Succs.empty();
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if there is a path from the specified node to any of the nodes
|
|
|
|
/// in DestNodes. Keep track and return the nodes in any path.
|
|
|
|
static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
|
|
|
|
SetVector<SUnit *> &DestNodes,
|
|
|
|
SetVector<SUnit *> &Exclude,
|
|
|
|
SmallPtrSet<SUnit *, 8> &Visited) {
|
|
|
|
if (Cur->isBoundaryNode())
|
|
|
|
return false;
|
|
|
|
if (Exclude.count(Cur) != 0)
|
|
|
|
return false;
|
|
|
|
if (DestNodes.count(Cur) != 0)
|
|
|
|
return true;
|
|
|
|
if (!Visited.insert(Cur).second)
|
|
|
|
return Path.count(Cur) != 0;
|
|
|
|
bool FoundPath = false;
|
|
|
|
for (auto &SI : Cur->Succs)
|
|
|
|
FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
|
|
|
|
for (auto &PI : Cur->Preds)
|
|
|
|
if (PI.getKind() == SDep::Anti)
|
|
|
|
FoundPath |=
|
|
|
|
computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
|
|
|
|
if (FoundPath)
|
|
|
|
Path.insert(Cur);
|
|
|
|
return FoundPath;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true when every element of \p Set1 is also found in \p Set2
/// (an empty \p Set1 is a subset of anything).
template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
  for (const auto &Elem : Set1) {
    const bool Present = Set2.count(Elem) != 0;
    if (!Present)
      return false;
  }
  return true;
}
|
|
|
|
|
|
|
|
/// Compute the live-out registers for the instructions in a node-set.
|
|
|
|
/// The live-out registers are those that are defined in the node-set,
|
|
|
|
/// but not used. Except for use operands of Phis.
|
|
|
|
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
|
|
|
|
NodeSet &NS) {
|
|
|
|
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
|
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
SmallVector<RegisterMaskPair, 8> LiveOutRegs;
|
|
|
|
SmallSet<unsigned, 4> Uses;
|
|
|
|
for (SUnit *SU : NS) {
|
|
|
|
const MachineInstr *MI = SU->getInstr();
|
|
|
|
if (MI->isPHI())
|
|
|
|
continue;
|
2016-10-25 05:36:43 +08:00
|
|
|
for (const MachineOperand &MO : MI->operands())
|
|
|
|
if (MO.isReg() && MO.isUse()) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register Reg = MO.getReg();
|
2019-08-02 07:27:28 +08:00
|
|
|
if (Register::isVirtualRegister(Reg))
|
2016-07-30 00:44:44 +08:00
|
|
|
Uses.insert(Reg);
|
|
|
|
else if (MRI.isAllocatable(Reg))
|
|
|
|
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
|
|
|
|
Uses.insert(*Units);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (SUnit *SU : NS)
|
2016-10-25 05:36:43 +08:00
|
|
|
for (const MachineOperand &MO : SU->getInstr()->operands())
|
|
|
|
if (MO.isReg() && MO.isDef() && !MO.isDead()) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register Reg = MO.getReg();
|
2019-08-02 07:27:28 +08:00
|
|
|
if (Register::isVirtualRegister(Reg)) {
|
2016-07-30 00:44:44 +08:00
|
|
|
if (!Uses.count(Reg))
|
2016-12-15 22:36:06 +08:00
|
|
|
LiveOutRegs.push_back(RegisterMaskPair(Reg,
|
|
|
|
LaneBitmask::getNone()));
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (MRI.isAllocatable(Reg)) {
|
|
|
|
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
|
|
|
|
if (!Uses.count(*Units))
|
2016-12-15 22:36:06 +08:00
|
|
|
LiveOutRegs.push_back(RegisterMaskPair(*Units,
|
|
|
|
LaneBitmask::getNone()));
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
RPTracker.addLiveRegs(LiveOutRegs);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A heuristic to filter nodes in recurrent node-sets if the register
|
|
|
|
/// pressure of a set is too high.
|
|
|
|
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
|
|
|
|
for (auto &NS : NodeSets) {
|
|
|
|
// Skip small node-sets since they won't cause register pressure problems.
|
|
|
|
if (NS.size() <= 2)
|
|
|
|
continue;
|
|
|
|
IntervalPressure RecRegPressure;
|
|
|
|
RegPressureTracker RecRPTracker(RecRegPressure);
|
|
|
|
RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
|
|
|
|
computeLiveOuts(MF, RecRPTracker, NS);
|
|
|
|
RecRPTracker.closeBottom();
|
|
|
|
|
|
|
|
std::vector<SUnit *> SUnits(NS.begin(), NS.end());
|
llvm::sort(C.begin(), C.end(), ...) -> llvm::sort(C, ...)
Summary: The convenience wrapper in STLExtras is available since rL342102.
Reviewers: dblaikie, javed.absar, JDevlieghere, andreadb
Subscribers: MatzeB, sanjoy, arsenm, dschuff, mehdi_amini, sdardis, nemanjai, jvesely, nhaehnle, sbc100, jgravelle-google, eraman, aheejin, kbarton, JDevlieghere, javed.absar, gbedwell, jrtc27, mgrang, atanasyan, steven_wu, george.burgess.iv, dexonsmith, kristina, jsji, llvm-commits
Differential Revision: https://reviews.llvm.org/D52573
llvm-svn: 343163
2018-09-27 10:13:45 +08:00
|
|
|
llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
|
2016-07-30 00:44:44 +08:00
|
|
|
return A->NodeNum > B->NodeNum;
|
|
|
|
});
|
|
|
|
|
|
|
|
for (auto &SU : SUnits) {
|
|
|
|
// Since we're computing the register pressure for a subset of the
|
|
|
|
// instructions in a block, we need to set the tracker for each
|
|
|
|
// instruction in the node-set. The tracker is set to the instruction
|
|
|
|
// just after the one we're interested in.
|
|
|
|
MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
|
|
|
|
RecRPTracker.setPos(std::next(CurInstI));
|
|
|
|
|
|
|
|
RegPressureDelta RPDelta;
|
|
|
|
ArrayRef<PressureChange> CriticalPSets;
|
|
|
|
RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
|
|
|
|
CriticalPSets,
|
|
|
|
RecRegPressure.MaxSetPressure);
|
|
|
|
if (RPDelta.Excess.isValid()) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
|
|
|
|
<< TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
|
|
|
|
<< ":" << RPDelta.Excess.getUnitInc());
|
2016-07-30 00:44:44 +08:00
|
|
|
NS.setExceedPressure(SU);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
RecRPTracker.recede();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A heuristic to colocate node sets that have the same set of
|
|
|
|
/// successors.
|
|
|
|
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
|
|
|
|
unsigned Colocate = 0;
|
|
|
|
for (int i = 0, e = NodeSets.size(); i < e; ++i) {
|
|
|
|
NodeSet &N1 = NodeSets[i];
|
|
|
|
SmallSetVector<SUnit *, 8> S1;
|
|
|
|
if (N1.empty() || !succ_L(N1, S1))
|
|
|
|
continue;
|
|
|
|
for (int j = i + 1; j < e; ++j) {
|
|
|
|
NodeSet &N2 = NodeSets[j];
|
|
|
|
if (N1.compareRecMII(N2) != 0)
|
|
|
|
continue;
|
|
|
|
SmallSetVector<SUnit *, 8> S2;
|
|
|
|
if (N2.empty() || !succ_L(N2, S2))
|
|
|
|
continue;
|
|
|
|
if (isSubset(S1, S2) && S1.size() == S2.size()) {
|
|
|
|
N1.setColocate(++Colocate);
|
|
|
|
N2.setColocate(Colocate);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Check if the existing node-sets are profitable. If not, then ignore the
|
|
|
|
/// recurrent node-sets, and attempt to schedule all nodes together. This is
|
2018-03-27 01:07:41 +08:00
|
|
|
/// a heuristic. If the MII is large and all the recurrent node-sets are small,
|
|
|
|
/// then it's best to try to schedule all instructions together instead of
|
|
|
|
/// starting with the recurrent node-sets.
|
2016-07-30 00:44:44 +08:00
|
|
|
void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
|
|
|
|
// Look for loops with a large MII.
|
2018-03-27 01:07:41 +08:00
|
|
|
if (MII < 17)
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
|
|
|
// Check if the node-set contains only a simple add recurrence.
|
2018-03-27 01:07:41 +08:00
|
|
|
for (auto &NS : NodeSets) {
|
|
|
|
if (NS.getRecMII() > 2)
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2018-03-27 01:07:41 +08:00
|
|
|
if (NS.getMaxDepth() > MII)
|
2016-07-30 00:44:44 +08:00
|
|
|
return;
|
2018-03-27 01:07:41 +08:00
|
|
|
}
|
|
|
|
NodeSets.clear();
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
|
2018-03-27 01:07:41 +08:00
|
|
|
return;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Add the nodes that do not belong to a recurrence set into groups
|
|
|
|
/// based upon connected componenets.
|
|
|
|
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
|
|
|
|
SetVector<SUnit *> NodesAdded;
|
|
|
|
SmallPtrSet<SUnit *, 8> Visited;
|
|
|
|
// Add the nodes that are on a path between the previous node sets and
|
|
|
|
// the current node set.
|
|
|
|
for (NodeSet &I : NodeSets) {
|
|
|
|
SmallSetVector<SUnit *, 8> N;
|
|
|
|
// Add the nodes from the current node set to the previous node set.
|
|
|
|
if (succ_L(I, N)) {
|
|
|
|
SetVector<SUnit *> Path;
|
|
|
|
for (SUnit *NI : N) {
|
|
|
|
Visited.clear();
|
|
|
|
computePath(NI, Path, NodesAdded, I, Visited);
|
|
|
|
}
|
2017-09-12 07:00:48 +08:00
|
|
|
if (!Path.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
I.insert(Path.begin(), Path.end());
|
|
|
|
}
|
|
|
|
// Add the nodes from the previous node set to the current node set.
|
|
|
|
N.clear();
|
|
|
|
if (succ_L(NodesAdded, N)) {
|
|
|
|
SetVector<SUnit *> Path;
|
|
|
|
for (SUnit *NI : N) {
|
|
|
|
Visited.clear();
|
|
|
|
computePath(NI, Path, I, NodesAdded, Visited);
|
|
|
|
}
|
2017-09-12 07:00:48 +08:00
|
|
|
if (!Path.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
I.insert(Path.begin(), Path.end());
|
|
|
|
}
|
|
|
|
NodesAdded.insert(I.begin(), I.end());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a new node set with the connected nodes of any successor of a node
|
|
|
|
// in a recurrent set.
|
|
|
|
NodeSet NewSet;
|
|
|
|
SmallSetVector<SUnit *, 8> N;
|
|
|
|
if (succ_L(NodesAdded, N))
|
|
|
|
for (SUnit *I : N)
|
|
|
|
addConnectedNodes(I, NewSet, NodesAdded);
|
2017-09-12 07:00:48 +08:00
|
|
|
if (!NewSet.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
NodeSets.push_back(NewSet);
|
|
|
|
|
|
|
|
// Create a new node set with the connected nodes of any predecessor of a node
|
|
|
|
// in a recurrent set.
|
|
|
|
NewSet.clear();
|
|
|
|
if (pred_L(NodesAdded, N))
|
|
|
|
for (SUnit *I : N)
|
|
|
|
addConnectedNodes(I, NewSet, NodesAdded);
|
2017-09-12 07:00:48 +08:00
|
|
|
if (!NewSet.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
NodeSets.push_back(NewSet);
|
|
|
|
|
2018-04-13 19:37:06 +08:00
|
|
|
// Create new nodes sets with the connected nodes any remaining node that
|
2016-07-30 00:44:44 +08:00
|
|
|
// has no predecessor.
|
|
|
|
for (unsigned i = 0; i < SUnits.size(); ++i) {
|
|
|
|
SUnit *SU = &SUnits[i];
|
|
|
|
if (NodesAdded.count(SU) == 0) {
|
|
|
|
NewSet.clear();
|
|
|
|
addConnectedNodes(SU, NewSet, NodesAdded);
|
2017-09-12 07:00:48 +08:00
|
|
|
if (!NewSet.empty())
|
2016-07-30 00:44:44 +08:00
|
|
|
NodeSets.push_back(NewSet);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-26 05:59:53 +08:00
|
|
|
/// Add the node to the set, and add all of its connected nodes to the set.
|
2016-07-30 00:44:44 +08:00
|
|
|
void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
|
|
|
|
SetVector<SUnit *> &NodesAdded) {
|
|
|
|
NewSet.insert(SU);
|
|
|
|
NodesAdded.insert(SU);
|
|
|
|
for (auto &SI : SU->Succs) {
|
|
|
|
SUnit *Successor = SI.getSUnit();
|
|
|
|
if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
|
|
|
|
addConnectedNodes(Successor, NewSet, NodesAdded);
|
|
|
|
}
|
|
|
|
for (auto &PI : SU->Preds) {
|
|
|
|
SUnit *Predecessor = PI.getSUnit();
|
|
|
|
if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
|
|
|
|
addConnectedNodes(Predecessor, NewSet, NodesAdded);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if Set1 contains elements in Set2. The elements in common
|
|
|
|
/// are returned in a different container.
|
|
|
|
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
|
|
|
|
SmallSetVector<SUnit *, 8> &Result) {
|
|
|
|
Result.clear();
|
|
|
|
for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
|
|
|
|
SUnit *SU = Set1[i];
|
|
|
|
if (Set2.count(SU) != 0)
|
|
|
|
Result.insert(SU);
|
|
|
|
}
|
|
|
|
return !Result.empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Merge the recurrence node sets that have the same initial node.
|
|
|
|
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
|
|
|
|
for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
|
|
|
|
++I) {
|
|
|
|
NodeSet &NI = *I;
|
|
|
|
for (NodeSetType::iterator J = I + 1; J != E;) {
|
|
|
|
NodeSet &NJ = *J;
|
|
|
|
if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
|
|
|
|
if (NJ.compareRecMII(NI) > 0)
|
|
|
|
NI.setRecMII(NJ.getRecMII());
|
|
|
|
for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
|
|
|
|
++NII)
|
|
|
|
I->insert(*NII);
|
|
|
|
NodeSets.erase(J);
|
|
|
|
E = NodeSets.end();
|
|
|
|
} else {
|
|
|
|
++J;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Remove nodes that have been scheduled in previous NodeSets.
|
|
|
|
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
|
|
|
|
for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
|
|
|
|
++I)
|
|
|
|
for (NodeSetType::iterator J = I + 1; J != E;) {
|
|
|
|
J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
|
|
|
|
|
2017-09-12 07:00:48 +08:00
|
|
|
if (J->empty()) {
|
2016-07-30 00:44:44 +08:00
|
|
|
NodeSets.erase(J);
|
|
|
|
E = NodeSets.end();
|
|
|
|
} else {
|
|
|
|
++J;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute an ordered list of the dependence graph nodes, which
|
|
|
|
/// indicates the order that the nodes will be scheduled. This is a
|
|
|
|
/// two-level algorithm. First, a partial order is created, which
|
|
|
|
/// consists of a list of sets ordered from highest to lowest priority.
|
|
|
|
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
|
|
|
|
SmallSetVector<SUnit *, 8> R;
|
|
|
|
NodeOrder.clear();
|
|
|
|
|
|
|
|
for (auto &Nodes : NodeSets) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
OrderKind Order;
|
|
|
|
SmallSetVector<SUnit *, 8> N;
|
|
|
|
if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
|
|
|
|
R.insert(N.begin(), N.end());
|
|
|
|
Order = BottomUp;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
|
|
|
|
R.insert(N.begin(), N.end());
|
|
|
|
Order = TopDown;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Top down (succs) ");
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (isIntersect(N, Nodes, R)) {
|
|
|
|
// If some of the successors are in the existing node-set, then use the
|
|
|
|
// top-down ordering.
|
|
|
|
Order = TopDown;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Top down (intersect) ");
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (NodeSets.size() == 1) {
|
|
|
|
for (auto &N : Nodes)
|
|
|
|
if (N->Succs.size() == 0)
|
|
|
|
R.insert(N);
|
|
|
|
Order = BottomUp;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Bottom up (all) ");
|
2016-07-30 00:44:44 +08:00
|
|
|
} else {
|
|
|
|
// Find the node with the highest ASAP.
|
|
|
|
SUnit *maxASAP = nullptr;
|
|
|
|
for (SUnit *SU : Nodes) {
|
2018-03-27 00:33:16 +08:00
|
|
|
if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
|
|
|
|
(getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxASAP = SU;
|
|
|
|
}
|
|
|
|
R.insert(maxASAP);
|
|
|
|
Order = BottomUp;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Bottom up (default) ");
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
while (!R.empty()) {
|
|
|
|
if (Order == TopDown) {
|
|
|
|
// Choose the node with the maximum height. If more than one, choose
|
2018-03-27 00:33:16 +08:00
|
|
|
// the node wiTH the maximum ZeroLatencyHeight. If still more than one,
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// choose the node with the lowest MOV.
|
2016-07-30 00:44:44 +08:00
|
|
|
while (!R.empty()) {
|
|
|
|
SUnit *maxHeight = nullptr;
|
|
|
|
for (SUnit *I : R) {
|
2016-08-12 01:20:18 +08:00
|
|
|
if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxHeight = I;
|
|
|
|
else if (getHeight(I) == getHeight(maxHeight) &&
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxHeight = I;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
else if (getHeight(I) == getHeight(maxHeight) &&
|
|
|
|
getZeroLatencyHeight(I) ==
|
|
|
|
getZeroLatencyHeight(maxHeight) &&
|
|
|
|
getMOV(I) < getMOV(maxHeight))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxHeight = I;
|
|
|
|
}
|
|
|
|
NodeOrder.insert(maxHeight);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
|
2016-07-30 00:44:44 +08:00
|
|
|
R.remove(maxHeight);
|
|
|
|
for (const auto &I : maxHeight->Succs) {
|
|
|
|
if (Nodes.count(I.getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(I.getSUnit()) != 0)
|
|
|
|
continue;
|
|
|
|
if (ignoreDependence(I, false))
|
|
|
|
continue;
|
|
|
|
R.insert(I.getSUnit());
|
|
|
|
}
|
|
|
|
// Back-edges are predecessors with an anti-dependence.
|
|
|
|
for (const auto &I : maxHeight->Preds) {
|
|
|
|
if (I.getKind() != SDep::Anti)
|
|
|
|
continue;
|
|
|
|
if (Nodes.count(I.getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(I.getSUnit()) != 0)
|
|
|
|
continue;
|
|
|
|
R.insert(I.getSUnit());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Order = BottomUp;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
|
2016-07-30 00:44:44 +08:00
|
|
|
SmallSetVector<SUnit *, 8> N;
|
|
|
|
if (pred_L(NodeOrder, N, &Nodes))
|
|
|
|
R.insert(N.begin(), N.end());
|
|
|
|
} else {
|
|
|
|
// Choose the node with the maximum depth. If more than one, choose
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// the node with the maximum ZeroLatencyDepth. If still more than one,
|
|
|
|
// choose the node with the lowest MOV.
|
2016-07-30 00:44:44 +08:00
|
|
|
while (!R.empty()) {
|
|
|
|
SUnit *maxDepth = nullptr;
|
|
|
|
for (SUnit *I : R) {
|
2016-08-12 01:20:18 +08:00
|
|
|
if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxDepth = I;
|
|
|
|
else if (getDepth(I) == getDepth(maxDepth) &&
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxDepth = I;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
else if (getDepth(I) == getDepth(maxDepth) &&
|
|
|
|
getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
|
|
|
|
getMOV(I) < getMOV(maxDepth))
|
2016-07-30 00:44:44 +08:00
|
|
|
maxDepth = I;
|
|
|
|
}
|
|
|
|
NodeOrder.insert(maxDepth);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
|
2016-07-30 00:44:44 +08:00
|
|
|
R.remove(maxDepth);
|
|
|
|
if (Nodes.isExceedSU(maxDepth)) {
|
|
|
|
Order = TopDown;
|
|
|
|
R.clear();
|
|
|
|
R.insert(Nodes.getNode(0));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
for (const auto &I : maxDepth->Preds) {
|
|
|
|
if (Nodes.count(I.getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(I.getSUnit()) != 0)
|
|
|
|
continue;
|
|
|
|
R.insert(I.getSUnit());
|
|
|
|
}
|
|
|
|
// Back-edges are predecessors with an anti-dependence.
|
|
|
|
for (const auto &I : maxDepth->Succs) {
|
|
|
|
if (I.getKind() != SDep::Anti)
|
|
|
|
continue;
|
|
|
|
if (Nodes.count(I.getSUnit()) == 0)
|
|
|
|
continue;
|
|
|
|
if (NodeOrder.count(I.getSUnit()) != 0)
|
|
|
|
continue;
|
|
|
|
R.insert(I.getSUnit());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Order = TopDown;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
|
2016-07-30 00:44:44 +08:00
|
|
|
SmallSetVector<SUnit *, 8> N;
|
|
|
|
if (succ_L(NodeOrder, N, &Nodes))
|
|
|
|
R.insert(N.begin(), N.end());
|
|
|
|
}
|
|
|
|
}
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
dbgs() << "Node order: ";
|
|
|
|
for (SUnit *I : NodeOrder)
|
|
|
|
dbgs() << " " << I->NodeNum << " ";
|
|
|
|
dbgs() << "\n";
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Process the nodes in the computed order and create the pipelined schedule
|
|
|
|
/// of the instructions, if possible. Return true if a schedule is found.
|
|
|
|
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
|
2019-05-31 23:35:19 +08:00
|
|
|
|
|
|
|
if (NodeOrder.empty()){
|
|
|
|
LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
2019-05-31 23:35:19 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
bool scheduleFound = false;
|
2019-01-23 11:26:10 +08:00
|
|
|
unsigned II = 0;
|
2016-07-30 00:44:44 +08:00
|
|
|
// Keep increasing II until a valid schedule is found.
|
2019-01-23 11:26:10 +08:00
|
|
|
for (II = MII; II <= MAX_II && !scheduleFound; ++II) {
|
2016-07-30 00:44:44 +08:00
|
|
|
Schedule.reset();
|
|
|
|
Schedule.setInitiationInterval(II);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
SetVector<SUnit *>::iterator NI = NodeOrder.begin();
|
|
|
|
SetVector<SUnit *>::iterator NE = NodeOrder.end();
|
|
|
|
do {
|
|
|
|
SUnit *SU = *NI;
|
|
|
|
|
|
|
|
// Compute the schedule time for the instruction, which is based
|
|
|
|
// upon the scheduled time for any predecessors/successors.
|
|
|
|
int EarlyStart = INT_MIN;
|
|
|
|
int LateStart = INT_MAX;
|
|
|
|
// These values are set when the size of the schedule window is limited
|
|
|
|
// due to chain dependences.
|
|
|
|
int SchedEnd = INT_MAX;
|
|
|
|
int SchedStart = INT_MIN;
|
|
|
|
Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
|
|
|
|
II, this);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2019-05-31 23:35:19 +08:00
|
|
|
dbgs() << "\n";
|
2016-07-30 00:44:44 +08:00
|
|
|
dbgs() << "Inst (" << SU->NodeNum << ") ";
|
|
|
|
SU->getInstr()->dump();
|
|
|
|
dbgs() << "\n";
|
|
|
|
});
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2019-05-31 23:35:19 +08:00
|
|
|
dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
|
|
|
|
LateStart, SchedEnd, SchedStart);
|
2016-07-30 00:44:44 +08:00
|
|
|
});
|
|
|
|
|
|
|
|
if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
|
|
|
|
SchedStart > LateStart)
|
|
|
|
scheduleFound = false;
|
|
|
|
else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
|
|
|
|
SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
|
|
|
|
scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
|
|
|
|
} else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
|
|
|
|
SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
|
|
|
|
scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
|
|
|
|
} else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
|
|
|
|
SchedEnd =
|
|
|
|
std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
|
|
|
|
// When scheduling a Phi it is better to start at the late cycle and go
|
|
|
|
// backwards. The default order may insert the Phi too far away from
|
|
|
|
// its first dependence.
|
|
|
|
if (SU->getInstr()->isPHI())
|
|
|
|
scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
|
|
|
|
else
|
|
|
|
scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
|
|
|
|
} else {
|
|
|
|
int FirstCycle = Schedule.getFirstCycle();
|
|
|
|
scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
|
|
|
|
FirstCycle + getASAP(SU) + II - 1, II);
|
|
|
|
}
|
|
|
|
// Even if we find a schedule, make sure the schedule doesn't exceed the
|
|
|
|
// allowable number of stages. We keep trying if this happens.
|
|
|
|
if (scheduleFound)
|
|
|
|
if (SwpMaxStages > -1 &&
|
|
|
|
Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
|
|
|
|
scheduleFound = false;
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
if (!scheduleFound)
|
|
|
|
dbgs() << "\tCan't schedule\n";
|
|
|
|
});
|
|
|
|
} while (++NI != NE && scheduleFound);
|
|
|
|
|
|
|
|
// If a schedule is found, check if it is a valid schedule too.
|
|
|
|
if (scheduleFound)
|
|
|
|
scheduleFound = Schedule.isValidSchedule(this);
|
|
|
|
}
|
|
|
|
|
2019-01-23 11:26:10 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
|
|
|
|
<< ")\n");
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2020-05-05 22:27:59 +08:00
|
|
|
if (scheduleFound) {
|
2016-07-30 00:44:44 +08:00
|
|
|
Schedule.finalizeSchedule(this);
|
2020-05-05 22:27:59 +08:00
|
|
|
Pass.ORE->emit([&]() {
|
|
|
|
return MachineOptimizationRemarkAnalysis(
|
|
|
|
DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
|
|
|
|
<< "Schedule found with Initiation Interval: " << ore::NV("II", II)
|
|
|
|
<< ", MaxStageCount: "
|
|
|
|
<< ore::NV("MaxStageCount", Schedule.getMaxStageCount());
|
|
|
|
});
|
|
|
|
} else
|
2016-07-30 00:44:44 +08:00
|
|
|
Schedule.reset();
|
|
|
|
|
|
|
|
return scheduleFound && Schedule.getMaxStageCount() > 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if we can compute the amount the instruction changes
|
|
|
|
/// during each iteration. Set Delta to the amount of the change.
|
|
|
|
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
|
|
|
|
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
|
2019-04-19 17:08:38 +08:00
|
|
|
const MachineOperand *BaseOp;
|
2016-07-30 00:44:44 +08:00
|
|
|
int64_t Offset;
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
bool OffsetIsScalable;
|
|
|
|
if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// FIXME: This algorithm assumes instructions have fixed-size offsets.
|
|
|
|
if (OffsetIsScalable)
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
2018-11-28 20:00:20 +08:00
|
|
|
if (!BaseOp->isReg())
|
|
|
|
return false;
|
|
|
|
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register BaseReg = BaseOp->getReg();
|
2018-11-28 20:00:20 +08:00
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
// Check if there is a Phi. If so, get the definition in the loop.
|
|
|
|
MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
|
|
|
|
if (BaseDef && BaseDef->isPHI()) {
|
|
|
|
BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
|
|
|
|
BaseDef = MRI.getVRegDef(BaseReg);
|
|
|
|
}
|
|
|
|
if (!BaseDef)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
int D = 0;
|
2016-08-02 01:55:48 +08:00
|
|
|
if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
Delta = D;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Check if we can change the instruction to use an offset value from the
|
|
|
|
/// previous iteration. If so, return true and set the base and offset values
|
|
|
|
/// so that we can rewrite the load, if necessary.
|
|
|
|
/// v1 = Phi(v0, v3)
|
|
|
|
/// v2 = load v1, 0
|
|
|
|
/// v3 = post_store v1, 4, x
|
|
|
|
/// This function enables the load to be rewritten as v2 = load v3, 4.
|
|
|
|
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
|
|
|
|
unsigned &BasePos,
|
|
|
|
unsigned &OffsetPos,
|
|
|
|
unsigned &NewBase,
|
|
|
|
int64_t &Offset) {
|
|
|
|
// Get the load instruction.
|
2016-08-02 01:55:48 +08:00
|
|
|
if (TII->isPostIncrement(*MI))
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
unsigned BasePosLd, OffsetPosLd;
|
2016-08-02 01:55:48 +08:00
|
|
|
if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register BaseReg = MI->getOperand(BasePosLd).getReg();
|
2016-07-30 00:44:44 +08:00
|
|
|
|
|
|
|
// Look for the Phi instruction.
|
2017-10-11 07:50:49 +08:00
|
|
|
MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineInstr *Phi = MRI.getVRegDef(BaseReg);
|
|
|
|
if (!Phi || !Phi->isPHI())
|
|
|
|
return false;
|
|
|
|
// Get the register defined in the loop block.
|
|
|
|
unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
|
|
|
|
if (!PrevReg)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Check for the post-increment load/store instruction.
|
|
|
|
MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
|
|
|
|
if (!PrevDef || PrevDef == MI)
|
|
|
|
return false;
|
|
|
|
|
2016-08-02 01:55:48 +08:00
|
|
|
if (!TII->isPostIncrement(*PrevDef))
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
unsigned BasePos1 = 0, OffsetPos1 = 0;
|
2016-08-02 01:55:48 +08:00
|
|
|
if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
2018-03-27 00:17:06 +08:00
|
|
|
// Make sure that the instructions do not access the same memory location in
|
|
|
|
// the next iteration.
|
2016-07-30 00:44:44 +08:00
|
|
|
int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
|
|
|
|
int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
|
2018-03-27 00:17:06 +08:00
|
|
|
MachineInstr *NewMI = MF.CloneMachineInstr(MI);
|
|
|
|
NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
|
|
|
|
bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
|
|
|
|
MF.DeleteMachineInstr(NewMI);
|
|
|
|
if (!Disjoint)
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
// Set the return value once we determine that we return true.
|
|
|
|
BasePos = BasePosLd;
|
|
|
|
OffsetPos = OffsetPosLd;
|
|
|
|
NewBase = PrevReg;
|
|
|
|
Offset = StoreOffset;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Apply changes to the instruction if needed. The changes are need
|
|
|
|
/// to improve the scheduling and depend up on the final schedule.
|
2017-10-11 23:51:44 +08:00
|
|
|
void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
|
|
|
|
SMSchedule &Schedule) {
|
2016-07-30 00:44:44 +08:00
|
|
|
SUnit *SU = getSUnit(MI);
|
|
|
|
DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
|
|
|
|
InstrChanges.find(SU);
|
|
|
|
if (It != InstrChanges.end()) {
|
|
|
|
std::pair<unsigned, int64_t> RegAndOffset = It->second;
|
|
|
|
unsigned BasePos, OffsetPos;
|
2016-08-02 01:55:48 +08:00
|
|
|
if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
|
2017-10-11 23:51:44 +08:00
|
|
|
return;
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register BaseReg = MI->getOperand(BasePos).getReg();
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineInstr *LoopDef = findDefInLoop(BaseReg);
|
|
|
|
int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
|
|
|
|
int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
|
|
|
|
int BaseStageNum = Schedule.stageScheduled(SU);
|
|
|
|
int BaseCycleNum = Schedule.cycleScheduled(SU);
|
|
|
|
if (BaseStageNum < DefStageNum) {
|
|
|
|
MachineInstr *NewMI = MF.CloneMachineInstr(MI);
|
|
|
|
int OffsetDiff = DefStageNum - BaseStageNum;
|
|
|
|
if (DefCycleNum < BaseCycleNum) {
|
|
|
|
NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
|
|
|
|
if (OffsetDiff > 0)
|
|
|
|
--OffsetDiff;
|
|
|
|
}
|
|
|
|
int64_t NewOffset =
|
|
|
|
MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
|
|
|
|
NewMI->getOperand(OffsetPos).setImm(NewOffset);
|
2017-10-11 23:51:44 +08:00
|
|
|
SU->setInstr(NewMI);
|
|
|
|
MISUnitMap[NewMI] = SU;
|
2019-08-31 02:49:50 +08:00
|
|
|
NewMIs[MI] = NewMI;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-31 02:49:50 +08:00
|
|
|
/// Return the instruction in the loop that defines the register.
|
|
|
|
/// If the definition is a Phi, then follow the Phi operand to
|
|
|
|
/// the instruction in the loop.
|
|
|
|
MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
|
|
|
|
SmallPtrSet<MachineInstr *, 8> Visited;
|
|
|
|
MachineInstr *Def = MRI.getVRegDef(Reg);
|
|
|
|
while (Def->isPHI()) {
|
|
|
|
if (!Visited.insert(Def).second)
|
|
|
|
break;
|
|
|
|
for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
|
|
|
|
if (Def->getOperand(i + 1).getMBB() == BB) {
|
|
|
|
Def = MRI.getVRegDef(Def->getOperand(i).getReg());
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Def;
|
|
|
|
}
|
|
|
|
|
2018-03-27 00:05:55 +08:00
|
|
|
/// Return true for an order or output dependence that is loop carried
|
|
|
|
/// potentially. A dependence is loop carried if the destination defines a valu
|
|
|
|
/// that may be used or defined by the source in a subsequent iteration.
|
|
|
|
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
|
|
|
|
bool isSucc) {
|
|
|
|
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
|
|
|
|
Dep.isArtificial())
|
2016-07-30 00:44:44 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!SwpPruneLoopCarried)
|
|
|
|
return true;
|
|
|
|
|
2018-03-27 00:05:55 +08:00
|
|
|
if (Dep.getKind() == SDep::Output)
|
|
|
|
return true;
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
MachineInstr *SI = Source->getInstr();
|
|
|
|
MachineInstr *DI = Dep.getSUnit()->getInstr();
|
|
|
|
if (!isSucc)
|
|
|
|
std::swap(SI, DI);
|
|
|
|
assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
|
|
|
|
|
|
|
|
// Assume ordered loads and stores may have a loop carried dependence.
|
|
|
|
if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
|
2019-06-06 06:33:10 +08:00
|
|
|
SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
|
2016-07-30 00:44:44 +08:00
|
|
|
SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Only chain dependences between a load and store can be loop carried.
|
|
|
|
if (!DI->mayStore() || !SI->mayLoad())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
unsigned DeltaS, DeltaD;
|
|
|
|
if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
|
|
|
|
return true;
|
|
|
|
|
2019-04-19 17:08:38 +08:00
|
|
|
const MachineOperand *BaseOpS, *BaseOpD;
|
2016-07-30 00:44:44 +08:00
|
|
|
int64_t OffsetS, OffsetD;
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
bool OffsetSIsScalable, OffsetDIsScalable;
|
2016-07-30 00:44:44 +08:00
|
|
|
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
|
|
|
|
TRI) ||
|
|
|
|
!TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
|
|
|
|
TRI))
|
2016-07-30 00:44:44 +08:00
|
|
|
return true;
|
|
|
|
|
Add OffsetIsScalable to getMemOperandWithOffset
Summary:
Making `Scale` a `TypeSize` in AArch64InstrInfo::getMemOpInfo,
has the effect that all places where this information is used
(notably, TargetInstrInfo::getMemOperandWithOffset) will need
to consider Scale - and derived, Offset - possibly being scalable.
This patch adds a new operand `bool &OffsetIsScalable` to
TargetInstrInfo::getMemOperandWithOffset and fixes up all
the places where this function is used, to consider the
offset possibly being scalable.
In most cases, this means bailing out because the algorithm does not
(or cannot) support scalable offsets in places where it does some
form of alias checking for example.
Reviewers: rovka, efriedma, kristof.beyls
Reviewed By: efriedma
Subscribers: wuzish, kerbowa, MatzeB, arsenm, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, javed.absar, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, jsji, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72758
2020-02-18 22:32:26 +08:00
|
|
|
assert(!OffsetSIsScalable && !OffsetDIsScalable &&
|
|
|
|
"Expected offsets to be byte offsets");
|
|
|
|
|
2018-11-28 20:00:20 +08:00
|
|
|
if (!BaseOpS->isIdenticalTo(*BaseOpD))
|
2016-07-30 00:44:44 +08:00
|
|
|
return true;
|
|
|
|
|
2018-03-27 00:58:40 +08:00
|
|
|
// Check that the base register is incremented by a constant value for each
|
|
|
|
// iteration.
|
2018-11-28 20:00:20 +08:00
|
|
|
MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
|
2018-03-27 00:58:40 +08:00
|
|
|
if (!Def || !Def->isPHI())
|
|
|
|
return true;
|
|
|
|
unsigned InitVal = 0;
|
|
|
|
unsigned LoopVal = 0;
|
|
|
|
getPhiRegs(*Def, BB, InitVal, LoopVal);
|
|
|
|
MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
|
|
|
|
int D = 0;
|
|
|
|
if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
|
|
|
|
return true;
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
|
|
|
|
uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
|
|
|
|
|
|
|
|
// This is the main test, which checks the offset values and the loop
|
|
|
|
// increment value to determine if the accesses may be loop carried.
|
2019-04-12 05:57:51 +08:00
|
|
|
if (AccessSizeS == MemoryLocation::UnknownSize ||
|
|
|
|
AccessSizeD == MemoryLocation::UnknownSize)
|
|
|
|
return true;
|
2016-07-30 00:44:44 +08:00
|
|
|
|
2019-04-12 05:57:51 +08:00
|
|
|
if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
2016-12-23 03:21:20 +08:00
|
|
|
void SwingSchedulerDAG::postprocessDAG() {
|
|
|
|
for (auto &M : Mutations)
|
|
|
|
M->apply(this);
|
|
|
|
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// Try to schedule the node at the specified StartCycle and continue
|
|
|
|
/// until the node is schedule or the EndCycle is reached. This function
|
|
|
|
/// returns true if the node is scheduled. This routine may search either
|
|
|
|
/// forward or backward for a place to insert the instruction based upon
|
|
|
|
/// the relative values of StartCycle and EndCycle.
|
|
|
|
bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
|
|
|
|
bool forward = true;
|
2019-05-31 23:35:19 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << "Trying to insert node between " << StartCycle << " and "
|
|
|
|
<< EndCycle << " II: " << II << "\n";
|
|
|
|
});
|
2016-07-30 00:44:44 +08:00
|
|
|
if (StartCycle > EndCycle)
|
|
|
|
forward = false;
|
|
|
|
|
|
|
|
// The terminating condition depends on the direction.
|
|
|
|
int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
|
|
|
|
for (int curCycle = StartCycle; curCycle != termCycle;
|
|
|
|
forward ? ++curCycle : --curCycle) {
|
|
|
|
|
2019-05-29 11:02:59 +08:00
|
|
|
// Add the already scheduled instructions at the specified cycle to the
|
|
|
|
// DFA.
|
|
|
|
ProcItinResources.clearResources();
|
2016-07-30 00:44:44 +08:00
|
|
|
for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
|
|
|
|
checkCycle <= LastCycle; checkCycle += II) {
|
|
|
|
std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
|
|
|
|
|
|
|
|
for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
|
|
|
|
E = cycleInstrs.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
|
|
|
|
continue;
|
2019-05-29 11:02:59 +08:00
|
|
|
assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&
|
2016-07-30 00:44:44 +08:00
|
|
|
"These instructions have already been scheduled.");
|
2019-05-29 11:02:59 +08:00
|
|
|
ProcItinResources.reserveResources(*(*I)->getInstr());
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
|
2019-05-29 11:02:59 +08:00
|
|
|
ProcItinResources.canReserveResources(*SU->getInstr())) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
dbgs() << "\tinsert at cycle " << curCycle << " ";
|
|
|
|
SU->getInstr()->dump();
|
|
|
|
});
|
|
|
|
|
|
|
|
ScheduledInstrs[curCycle].push_back(SU);
|
|
|
|
InstrToCycle.insert(std::make_pair(SU, curCycle));
|
|
|
|
if (curCycle > LastCycle)
|
|
|
|
LastCycle = curCycle;
|
|
|
|
if (curCycle < FirstCycle)
|
|
|
|
FirstCycle = curCycle;
|
|
|
|
return true;
|
|
|
|
}
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2016-07-30 00:44:44 +08:00
|
|
|
dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
|
|
|
|
SU->getInstr()->dump();
|
|
|
|
});
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the cycle of the earliest scheduled instruction in the chain.
|
|
|
|
int SMSchedule::earliestCycleInChain(const SDep &Dep) {
|
|
|
|
SmallPtrSet<SUnit *, 8> Visited;
|
|
|
|
SmallVector<SDep, 8> Worklist;
|
|
|
|
Worklist.push_back(Dep);
|
|
|
|
int EarlyCycle = INT_MAX;
|
|
|
|
while (!Worklist.empty()) {
|
|
|
|
const SDep &Cur = Worklist.pop_back_val();
|
|
|
|
SUnit *PrevSU = Cur.getSUnit();
|
|
|
|
if (Visited.count(PrevSU))
|
|
|
|
continue;
|
|
|
|
std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
|
|
|
|
if (it == InstrToCycle.end())
|
|
|
|
continue;
|
|
|
|
EarlyCycle = std::min(EarlyCycle, it->second);
|
|
|
|
for (const auto &PI : PrevSU->Preds)
|
2020-03-24 22:32:00 +08:00
|
|
|
if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
|
2016-07-30 00:44:44 +08:00
|
|
|
Worklist.push_back(PI);
|
|
|
|
Visited.insert(PrevSU);
|
|
|
|
}
|
|
|
|
return EarlyCycle;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the cycle of the latest scheduled instruction in the chain.
|
|
|
|
int SMSchedule::latestCycleInChain(const SDep &Dep) {
|
|
|
|
SmallPtrSet<SUnit *, 8> Visited;
|
|
|
|
SmallVector<SDep, 8> Worklist;
|
|
|
|
Worklist.push_back(Dep);
|
|
|
|
int LateCycle = INT_MIN;
|
|
|
|
while (!Worklist.empty()) {
|
|
|
|
const SDep &Cur = Worklist.pop_back_val();
|
|
|
|
SUnit *SuccSU = Cur.getSUnit();
|
|
|
|
if (Visited.count(SuccSU))
|
|
|
|
continue;
|
|
|
|
std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
|
|
|
|
if (it == InstrToCycle.end())
|
|
|
|
continue;
|
|
|
|
LateCycle = std::max(LateCycle, it->second);
|
|
|
|
for (const auto &SI : SuccSU->Succs)
|
2020-03-24 22:32:00 +08:00
|
|
|
if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
|
2016-07-30 00:44:44 +08:00
|
|
|
Worklist.push_back(SI);
|
|
|
|
Visited.insert(SuccSU);
|
|
|
|
}
|
|
|
|
return LateCycle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// If an instruction has a use that spans multiple iterations, then
|
|
|
|
/// return true. These instructions are characterized by having a back-ege
|
|
|
|
/// to a Phi, which contains a reference to another Phi.
|
|
|
|
static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
|
|
|
|
for (auto &P : SU->Preds)
|
|
|
|
if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
|
|
|
|
for (auto &S : P.getSUnit()->Succs)
|
2018-03-26 23:53:23 +08:00
|
|
|
if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
|
2016-07-30 00:44:44 +08:00
|
|
|
return P.getSUnit();
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute the scheduling start slot for the instruction. The start slot
|
|
|
|
/// depends on any predecessor or successor nodes scheduled already.
|
|
|
|
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
|
|
|
|
int *MinEnd, int *MaxStart, int II,
|
|
|
|
SwingSchedulerDAG *DAG) {
|
|
|
|
// Iterate over each instruction that has been scheduled already. The start
|
2018-06-20 13:29:26 +08:00
|
|
|
// slot computation depends on whether the previously scheduled instruction
|
2016-07-30 00:44:44 +08:00
|
|
|
// is a predecessor or successor of the specified instruction.
|
|
|
|
for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
|
|
|
|
|
|
|
|
// Iterate over each instruction in the current cycle.
|
|
|
|
for (SUnit *I : getInstructions(cycle)) {
|
|
|
|
// Because we're processing a DAG for the dependences, we recognize
|
|
|
|
// the back-edge in recurrences by anti dependences.
|
|
|
|
for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
|
|
|
|
const SDep &Dep = SU->Preds[i];
|
|
|
|
if (Dep.getSUnit() == I) {
|
|
|
|
if (!DAG->isBackedge(SU, Dep)) {
|
2018-03-22 00:39:11 +08:00
|
|
|
int EarlyStart = cycle + Dep.getLatency() -
|
2016-07-30 00:44:44 +08:00
|
|
|
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
|
|
|
|
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
|
2018-03-27 00:05:55 +08:00
|
|
|
if (DAG->isLoopCarriedDep(SU, Dep, false)) {
|
2016-07-30 00:44:44 +08:00
|
|
|
int End = earliestCycleInChain(Dep) + (II - 1);
|
|
|
|
*MinEnd = std::min(*MinEnd, End);
|
|
|
|
}
|
|
|
|
} else {
|
2018-03-22 00:39:11 +08:00
|
|
|
int LateStart = cycle - Dep.getLatency() +
|
2016-07-30 00:44:44 +08:00
|
|
|
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
|
|
|
|
*MinLateStart = std::min(*MinLateStart, LateStart);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// For instruction that requires multiple iterations, make sure that
|
|
|
|
// the dependent instruction is not scheduled past the definition.
|
|
|
|
SUnit *BE = multipleIterations(I, DAG);
|
|
|
|
if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
|
|
|
|
!SU->isPred(I))
|
|
|
|
*MinLateStart = std::min(*MinLateStart, cycle);
|
|
|
|
}
|
2018-03-27 00:33:16 +08:00
|
|
|
for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
|
2016-07-30 00:44:44 +08:00
|
|
|
if (SU->Succs[i].getSUnit() == I) {
|
|
|
|
const SDep &Dep = SU->Succs[i];
|
|
|
|
if (!DAG->isBackedge(SU, Dep)) {
|
2018-03-22 00:39:11 +08:00
|
|
|
int LateStart = cycle - Dep.getLatency() +
|
2016-07-30 00:44:44 +08:00
|
|
|
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
|
|
|
|
*MinLateStart = std::min(*MinLateStart, LateStart);
|
2018-03-27 00:05:55 +08:00
|
|
|
if (DAG->isLoopCarriedDep(SU, Dep)) {
|
2016-07-30 00:44:44 +08:00
|
|
|
int Start = latestCycleInChain(Dep) + 1 - II;
|
|
|
|
*MaxStart = std::max(*MaxStart, Start);
|
|
|
|
}
|
|
|
|
} else {
|
2018-03-22 00:39:11 +08:00
|
|
|
int EarlyStart = cycle + Dep.getLatency() -
|
2016-07-30 00:44:44 +08:00
|
|
|
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
|
|
|
|
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
|
|
|
|
}
|
|
|
|
}
|
2018-03-27 00:33:16 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Order the instructions within a cycle so that the definitions occur
|
|
|
|
/// before the uses. Returns true if the instruction is added to the start
|
|
|
|
/// of the list, or false if added to the end.
|
2018-03-27 00:23:29 +08:00
|
|
|
void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
|
2016-07-30 00:44:44 +08:00
|
|
|
std::deque<SUnit *> &Insts) {
|
|
|
|
MachineInstr *MI = SU->getInstr();
|
|
|
|
bool OrderBeforeUse = false;
|
|
|
|
bool OrderAfterDef = false;
|
|
|
|
bool OrderBeforeDef = false;
|
|
|
|
unsigned MoveDef = 0;
|
|
|
|
unsigned MoveUse = 0;
|
|
|
|
int StageInst1 = stageScheduled(SU);
|
|
|
|
|
|
|
|
unsigned Pos = 0;
|
|
|
|
for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
|
|
|
|
++I, ++Pos) {
|
|
|
|
for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i);
|
2019-08-02 07:27:28 +08:00
|
|
|
if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
|
2016-07-30 00:44:44 +08:00
|
|
|
continue;
|
2018-03-27 00:23:29 +08:00
|
|
|
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register Reg = MO.getReg();
|
2016-07-30 00:44:44 +08:00
|
|
|
unsigned BasePos, OffsetPos;
|
2016-08-02 01:55:48 +08:00
|
|
|
if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
|
2016-07-30 00:44:44 +08:00
|
|
|
if (MI->getOperand(BasePos).getReg() == Reg)
|
|
|
|
if (unsigned NewReg = SSD->getInstrBaseReg(SU))
|
|
|
|
Reg = NewReg;
|
|
|
|
bool Reads, Writes;
|
|
|
|
std::tie(Reads, Writes) =
|
|
|
|
(*I)->getInstr()->readsWritesVirtualRegister(Reg);
|
|
|
|
if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
|
|
|
|
OrderBeforeUse = true;
|
2018-03-27 00:23:29 +08:00
|
|
|
if (MoveUse == 0)
|
|
|
|
MoveUse = Pos;
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
|
|
|
|
// Add the instruction after the scheduled instruction.
|
|
|
|
OrderAfterDef = true;
|
|
|
|
MoveDef = Pos;
|
|
|
|
} else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
|
|
|
|
if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
|
|
|
|
OrderBeforeUse = true;
|
2018-03-27 00:23:29 +08:00
|
|
|
if (MoveUse == 0)
|
|
|
|
MoveUse = Pos;
|
2016-07-30 00:44:44 +08:00
|
|
|
} else {
|
|
|
|
OrderAfterDef = true;
|
|
|
|
MoveDef = Pos;
|
|
|
|
}
|
|
|
|
} else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
|
|
|
|
OrderBeforeUse = true;
|
2018-03-27 00:23:29 +08:00
|
|
|
if (MoveUse == 0)
|
|
|
|
MoveUse = Pos;
|
2016-07-30 00:44:44 +08:00
|
|
|
if (MoveUse != 0) {
|
|
|
|
OrderAfterDef = true;
|
|
|
|
MoveDef = Pos - 1;
|
|
|
|
}
|
|
|
|
} else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
|
|
|
|
// Add the instruction before the scheduled instruction.
|
|
|
|
OrderBeforeUse = true;
|
2018-03-27 00:23:29 +08:00
|
|
|
if (MoveUse == 0)
|
|
|
|
MoveUse = Pos;
|
2016-07-30 00:44:44 +08:00
|
|
|
} else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
|
|
|
|
isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
|
2018-03-27 00:23:29 +08:00
|
|
|
if (MoveUse == 0) {
|
|
|
|
OrderBeforeDef = true;
|
|
|
|
MoveUse = Pos;
|
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
// Check for order dependences between instructions. Make sure the source
|
|
|
|
// is ordered before the destination.
|
2018-03-27 00:05:55 +08:00
|
|
|
for (auto &S : SU->Succs) {
|
|
|
|
if (S.getSUnit() != *I)
|
|
|
|
continue;
|
|
|
|
if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
|
|
|
|
OrderBeforeUse = true;
|
|
|
|
if (Pos < MoveUse)
|
2016-07-30 00:44:44 +08:00
|
|
|
MoveUse = Pos;
|
|
|
|
}
|
2019-07-12 09:59:42 +08:00
|
|
|
// We did not handle HW dependences in previous for loop,
|
|
|
|
// and we normally set Latency = 0 for Anti deps,
|
|
|
|
// so may have nodes in same cycle with Anti denpendent on HW regs.
|
|
|
|
else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
|
|
|
|
OrderBeforeUse = true;
|
|
|
|
if ((MoveUse == 0) || (Pos < MoveUse))
|
|
|
|
MoveUse = Pos;
|
|
|
|
}
|
2018-03-27 00:05:55 +08:00
|
|
|
}
|
|
|
|
for (auto &P : SU->Preds) {
|
|
|
|
if (P.getSUnit() != *I)
|
|
|
|
continue;
|
|
|
|
if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
|
|
|
|
OrderAfterDef = true;
|
|
|
|
MoveDef = Pos;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
2018-03-27 00:05:55 +08:00
|
|
|
}
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// A circular dependence.
|
|
|
|
if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
|
|
|
|
OrderBeforeUse = false;
|
|
|
|
|
|
|
|
// OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
|
|
|
|
// to a loop-carried dependence.
|
|
|
|
if (OrderBeforeDef)
|
|
|
|
OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
|
|
|
|
|
|
|
|
// The uncommon case when the instruction order needs to be updated because
|
|
|
|
// there is both a use and def.
|
|
|
|
if (OrderBeforeUse && OrderAfterDef) {
|
|
|
|
SUnit *UseSU = Insts.at(MoveUse);
|
|
|
|
SUnit *DefSU = Insts.at(MoveDef);
|
|
|
|
if (MoveUse > MoveDef) {
|
|
|
|
Insts.erase(Insts.begin() + MoveUse);
|
|
|
|
Insts.erase(Insts.begin() + MoveDef);
|
|
|
|
} else {
|
|
|
|
Insts.erase(Insts.begin() + MoveDef);
|
|
|
|
Insts.erase(Insts.begin() + MoveUse);
|
|
|
|
}
|
2018-03-27 00:23:29 +08:00
|
|
|
orderDependence(SSD, UseSU, Insts);
|
|
|
|
orderDependence(SSD, SU, Insts);
|
2016-07-30 00:44:44 +08:00
|
|
|
orderDependence(SSD, DefSU, Insts);
|
2018-03-27 00:23:29 +08:00
|
|
|
return;
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
// Put the new instruction first if there is a use in the list. Otherwise,
|
|
|
|
// put it at the end of the list.
|
|
|
|
if (OrderBeforeUse)
|
|
|
|
Insts.push_front(SU);
|
|
|
|
else
|
|
|
|
Insts.push_back(SU);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if the scheduled Phi has a loop carried operand.
|
|
|
|
bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
|
|
|
|
if (!Phi.isPHI())
|
|
|
|
return false;
|
2018-06-20 13:29:26 +08:00
|
|
|
assert(Phi.isPHI() && "Expecting a Phi.");
|
2016-07-30 00:44:44 +08:00
|
|
|
SUnit *DefSU = SSD->getSUnit(&Phi);
|
|
|
|
unsigned DefCycle = cycleScheduled(DefSU);
|
|
|
|
int DefStage = stageScheduled(DefSU);
|
|
|
|
|
|
|
|
unsigned InitVal = 0;
|
|
|
|
unsigned LoopVal = 0;
|
|
|
|
getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
|
|
|
|
SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
|
|
|
|
if (!UseSU)
|
|
|
|
return true;
|
|
|
|
if (UseSU->getInstr()->isPHI())
|
|
|
|
return true;
|
|
|
|
unsigned LoopCycle = cycleScheduled(UseSU);
|
|
|
|
int LoopStage = stageScheduled(UseSU);
|
2016-11-14 18:40:23 +08:00
|
|
|
return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Return true if the instruction is a definition that is loop carried
|
|
|
|
/// and defines the use on the next iteration.
|
|
|
|
/// v1 = phi(v2, v3)
|
|
|
|
/// (Def) v3 = op v1
|
|
|
|
/// (MO) = v1
|
|
|
|
/// If MO appears before Def, then then v1 and v3 may get assigned to the same
|
|
|
|
/// register.
|
|
|
|
bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
|
|
|
|
MachineInstr *Def, MachineOperand &MO) {
|
|
|
|
if (!MO.isReg())
|
|
|
|
return false;
|
|
|
|
if (Def->isPHI())
|
|
|
|
return false;
|
|
|
|
MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
|
|
|
|
if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
|
|
|
|
return false;
|
|
|
|
if (!isLoopCarried(SSD, *Phi))
|
|
|
|
return false;
|
|
|
|
unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
|
|
|
|
for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &DMO = Def->getOperand(i);
|
|
|
|
if (!DMO.isReg() || !DMO.isDef())
|
|
|
|
continue;
|
|
|
|
if (DMO.getReg() == LoopReg)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the generated schedule is valid. This function checks if
|
|
|
|
// an instruction that uses a physical register is scheduled in a
|
|
|
|
// different stage than the definition. The pipeliner does not handle
|
|
|
|
// physical register values that may cross a basic block boundary.
|
|
|
|
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
|
|
|
|
for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
|
|
|
|
SUnit &SU = SSD->SUnits[i];
|
|
|
|
if (!SU.hasPhysRegDefs)
|
|
|
|
continue;
|
|
|
|
int StageDef = stageScheduled(&SU);
|
|
|
|
assert(StageDef != -1 && "Instruction should have been scheduled.");
|
|
|
|
for (auto &SI : SU.Succs)
|
|
|
|
if (SI.isAssignedRegDep())
|
2019-08-02 07:27:28 +08:00
|
|
|
if (Register::isPhysicalRegister(SI.getReg()))
|
2016-07-30 00:44:44 +08:00
|
|
|
if (stageScheduled(SI.getSUnit()) != StageDef)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
/// A property of the node order in swing-modulo-scheduling is
|
|
|
|
/// that for nodes outside circuits the following holds:
|
|
|
|
/// none of them is scheduled after both a successor and a
|
|
|
|
/// predecessor.
|
|
|
|
/// The method below checks whether the property is met.
|
|
|
|
/// If not, debug information is printed and statistics information updated.
|
|
|
|
/// Note that we do not use an assert statement.
|
|
|
|
/// The reason is that although an invalid node order may prevent
|
|
|
|
/// the pipeliner from finding a pipelined schedule for arbitrary II,
|
|
|
|
/// it does not lead to the generation of incorrect code.
|
|
|
|
void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
|
|
|
|
|
|
|
|
// a sorted vector that maps each SUnit to its index in the NodeOrder
|
|
|
|
typedef std::pair<SUnit *, unsigned> UnitIndex;
|
|
|
|
std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
|
|
|
|
|
|
|
|
for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
|
|
|
|
Indices.push_back(std::make_pair(NodeOrder[i], i));
|
|
|
|
|
|
|
|
auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
|
|
|
|
return std::get<0>(i1) < std::get<0>(i2);
|
|
|
|
};
|
|
|
|
|
|
|
|
// sort, so that we can perform a binary search
|
llvm::sort(C.begin(), C.end(), ...) -> llvm::sort(C, ...)
Summary: The convenience wrapper in STLExtras is available since rL342102.
Reviewers: dblaikie, javed.absar, JDevlieghere, andreadb
Subscribers: MatzeB, sanjoy, arsenm, dschuff, mehdi_amini, sdardis, nemanjai, jvesely, nhaehnle, sbc100, jgravelle-google, eraman, aheejin, kbarton, JDevlieghere, javed.absar, gbedwell, jrtc27, mgrang, atanasyan, steven_wu, george.burgess.iv, dexonsmith, kristina, jsji, llvm-commits
Differential Revision: https://reviews.llvm.org/D52573
llvm-svn: 343163
2018-09-27 10:13:45 +08:00
|
|
|
llvm::sort(Indices, CompareKey);
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
|
|
|
|
bool Valid = true;
|
2018-03-17 05:21:23 +08:00
|
|
|
(void)Valid;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
// for each SUnit in the NodeOrder, check whether
|
|
|
|
// it appears after both a successor and a predecessor
|
|
|
|
// of the SUnit. If this is the case, and the SUnit
|
|
|
|
// is not part of circuit, then the NodeOrder is not
|
|
|
|
// valid.
|
|
|
|
for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
|
|
|
|
SUnit *SU = NodeOrder[i];
|
|
|
|
unsigned Index = i;
|
|
|
|
|
|
|
|
bool PredBefore = false;
|
|
|
|
bool SuccBefore = false;
|
|
|
|
|
|
|
|
SUnit *Succ;
|
|
|
|
SUnit *Pred;
|
2018-03-17 05:21:23 +08:00
|
|
|
(void)Succ;
|
|
|
|
(void)Pred;
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
|
|
|
|
for (SDep &PredEdge : SU->Preds) {
|
|
|
|
SUnit *PredSU = PredEdge.getSUnit();
|
2019-06-21 13:40:31 +08:00
|
|
|
unsigned PredIndex = std::get<1>(
|
|
|
|
*llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
|
|
|
|
PredBefore = true;
|
|
|
|
Pred = PredSU;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (SDep &SuccEdge : SU->Succs) {
|
|
|
|
SUnit *SuccSU = SuccEdge.getSUnit();
|
2019-06-14 05:51:12 +08:00
|
|
|
// Do not process a boundary node, it was not included in NodeOrder,
|
|
|
|
// hence not in Indices either, call to std::lower_bound() below will
|
|
|
|
// return Indices.end().
|
|
|
|
if (SuccSU->isBoundaryNode())
|
|
|
|
continue;
|
2019-06-21 13:40:31 +08:00
|
|
|
unsigned SuccIndex = std::get<1>(
|
|
|
|
*llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
|
|
|
|
SuccBefore = true;
|
|
|
|
Succ = SuccSU;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
|
|
|
|
// instructions in circuits are allowed to be scheduled
|
|
|
|
// after both a successor and predecessor.
|
2019-06-21 13:40:31 +08:00
|
|
|
bool InCircuit = llvm::any_of(
|
|
|
|
Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
if (InCircuit)
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
else {
|
|
|
|
Valid = false;
|
|
|
|
NumNodeOrderIssues++;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Predecessor ";);
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
}
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
|
|
|
|
<< " are scheduled before node " << SU->NodeNum
|
|
|
|
<< "\n";);
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
[Pipeliner] Fixed node order issue related to zero latency edges
Summary:
A desired property of the node order in Swing Modulo Scheduling is
that for nodes outside circuits the following holds: none of them is
scheduled after both a successor and a predecessor. We call
node orders that meet this property valid.
Although invalid node orders do not lead to the generation of incorrect
code, they can cause the pipeliner not being able to find a pipelined schedule
for arbitrary II. The reason is that after scheduling the successor and the
predecessor of a node, no room may be left to schedule the node itself.
For data flow graphs with 0-latency edges, the node ordering algorithm
of Swing Modulo Scheduling can generate such undesired invalid node orders.
This patch fixes that.
In the remainder of this commit message, I will give an example
demonstrating the issue, explain the fix, and explain how the fix is tested.
Consider, as an example, the following data flow graph with all
edge latencies 0 and all edges pointing downward.
```
n0
/ \
n1 n3
\ /
n2
|
n4
```
Consider the implemented node order algorithm in top-down mode. In that mode,
the algorithm orders the nodes based on greatest Height and in case of equal
Height on lowest Movability. Finally, in case of equal Height and
Movability, given two nodes with an edge between them, the algorithm prefers
the source-node.
In the graph, for every node, the Height and Movability are equal to 0.
As will be explained below, the algorithm can generate the order n0, n1, n2, n3, n4.
So, node n3 is scheduled after its predecessor n0 and after its successor n2.
The reason that the algorithm can put node n2 in the order before node n3,
even though they have an edge between them in which node n3 is the source,
is the following: Suppose the algorithm has constructed the partial node
order n0, n1. Then, the nodes left to be ordered are nodes n2, n3, and n4. Suppose
that the while-loop in the implemented algorithm considers the nodes in
the order n4, n3, n2. The algorithm will start with node n4, and look for
more preferable nodes. First, node n4 will be compared with node n3. As the nodes
have equal Height and Movability and have no edge between them, the algorithm
will stick with node n4. Then node n4 is compared with node n2. Again the
Height and Movability are equal. But, this time, there is an edge between
the two nodes, and the algorithm will prefer the source node n2.
As there are no nodes left to compare, the algorithm will add node n2 to
the node order, yielding the partial node order n0, n1, n2. In this way node n2
arrives in the node-order before node n3.
To solve this, this patch introduces the ZeroLatencyHeight (ZLH) property
for nodes. It is defined as the maximum unweighted length of a path from the
given node to an arbitrary node in which each edge has latency 0.
So, ZLH(n0)=3, ZLH(n1)=ZLH(n3)=2, ZLH(n2)=1, and ZLH(n4)=0
In this patch, the preference for a greater ZeroLatencyHeight
is added in the top-down mode of the node ordering algorithm, after the
preference for a greater Height, and before the preference for a
lower Movability.
Therefore, the two allowed node-orders are n0, n1, n3, n2, n4 and n0, n3, n1, n2, n4.
Both of them are valid node orders.
In the same way, the bottom-up mode of the node ordering algorithm is adapted
by introducing the ZeroLatencyDepth property for nodes.
The patch is tested by adding extra checks to the following existing
lit-tests:
test/CodeGen/Hexagon/SUnit-boundary-prob.ll
test/CodeGen/Hexagon/frame-offset-overflow.ll
test/CodeGen/Hexagon/vect/vect-shuffle.ll
Before this patch, the pipeliner failed to pipeline the loops in these tests
due to invalid node-orders. After the patch, the pipeliner successfully
pipelines all these loops.
Reviewers: bcahoon
Reviewed By: bcahoon
Subscribers: Ayal, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D43620
llvm-svn: 326925
2018-03-08 02:53:36 +08:00
|
|
|
if (!Valid)
|
|
|
|
dbgs() << "Invalid node order found!\n";
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2017-10-11 23:51:44 +08:00
|
|
|
/// Attempt to fix the degenerate cases when the instruction serialization
|
|
|
|
/// causes the register lifetimes to overlap. For example,
|
|
|
|
/// p' = store_pi(p, b)
|
|
|
|
/// = load p, offset
|
|
|
|
/// In this case p and p' overlap, which means that two registers are needed.
|
|
|
|
/// Instead, this function changes the load to use p' and updates the offset.
|
|
|
|
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // OverlapReg is p and NewBaseReg is p' of the most recently seen tied
  // instruction p' = op(p). Zero means that no such instruction is pending.
  unsigned OverlapReg = 0;
  unsigned NewBaseReg = 0;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order. Only search
      // while a tied instruction is pending: without the guard, the zero
      // sentinel would match any use operand whose register number is 0,
      // and the base register would then be rewritten to 0.
      if (OverlapReg != 0 && MO.isReg() && MO.isUse() &&
          MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        auto It = InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            // Work on a clone so that the original instruction is left
            // untouched; the clone replaces it in the scheduling unit.
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // The pending tied instruction has been consumed by its first user.
        OverlapReg = 0;
        NewBaseReg = 0;
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
|
|
|
|
|
2016-07-30 00:44:44 +08:00
|
|
|
/// After the schedule has been formed, call this function to combine
|
|
|
|
/// the instructions from the different stages/cycles. That is, this
|
|
|
|
/// function creates a schedule that represents a single iteration.
|
|
|
|
void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
|
|
|
|
// Move all instructions to the first stage from later stages.
|
|
|
|
for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
|
|
|
|
for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
|
|
|
|
++stage) {
|
|
|
|
std::deque<SUnit *> &cycleInstrs =
|
|
|
|
ScheduledInstrs[cycle + (stage * InitiationInterval)];
|
|
|
|
for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
|
|
|
|
E = cycleInstrs.rend();
|
|
|
|
I != E; ++I)
|
|
|
|
ScheduledInstrs[cycle].push_front(*I);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Erase all the elements in the later stages. Only one iteration should
|
|
|
|
// remain in the scheduled list, and it contains all the instructions.
|
|
|
|
for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
|
|
|
|
ScheduledInstrs.erase(cycle);
|
|
|
|
|
|
|
|
// Change the registers in instruction as specified in the InstrChanges
|
|
|
|
// map. We need to use the new registers to create the correct order.
|
|
|
|
for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
|
|
|
|
SUnit *SU = &SSD->SUnits[i];
|
2017-10-11 23:51:44 +08:00
|
|
|
SSD->applyInstrChange(SU->getInstr(), *this);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Reorder the instructions in each cycle to fix and improve the
|
|
|
|
// generated code.
|
|
|
|
for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
|
|
|
|
std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
|
2018-03-27 00:23:29 +08:00
|
|
|
std::deque<SUnit *> newOrderPhi;
|
2016-07-30 00:44:44 +08:00
|
|
|
for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
|
|
|
|
SUnit *SU = cycleInstrs[i];
|
2018-03-27 00:23:29 +08:00
|
|
|
if (SU->getInstr()->isPHI())
|
|
|
|
newOrderPhi.push_back(SU);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
std::deque<SUnit *> newOrderI;
|
|
|
|
for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
|
|
|
|
SUnit *SU = cycleInstrs[i];
|
2018-03-27 00:23:29 +08:00
|
|
|
if (!SU->getInstr()->isPHI())
|
2016-07-30 00:44:44 +08:00
|
|
|
orderDependence(SSD, SU, newOrderI);
|
|
|
|
}
|
|
|
|
// Replace the old order with the new order.
|
2018-03-27 00:23:29 +08:00
|
|
|
cycleInstrs.swap(newOrderPhi);
|
2016-07-30 00:44:44 +08:00
|
|
|
cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
|
2017-10-11 23:51:44 +08:00
|
|
|
SSD->fixupRegisterOverlaps(cycleInstrs);
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dump(););
|
2016-07-30 00:44:44 +08:00
|
|
|
}
|
|
|
|
|
2019-01-15 01:24:11 +08:00
|
|
|
/// Write a summary line for this node set, followed by one entry per node
/// (its number and its instruction), to \p os.
void NodeSet::print(raw_ostream &os) const {
  // Summary of the set.
  os << "Num nodes " << size();
  os << " rec " << RecMII;
  os << " mov " << MaxMOV;
  os << " depth " << MaxDepth;
  os << " col " << Colocate << "\n";

  // One entry per scheduling unit in the set.
  for (const auto &Node : Nodes) {
    os << " SU(" << Node->NodeNum << ") ";
    os << *(Node->getInstr());
  }
  os << "\n";
}
|
|
|
|
|
2017-10-15 22:32:27 +08:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2016-07-30 00:44:44 +08:00
|
|
|
/// Print the schedule information to the given output.
|
|
|
|
void SMSchedule::print(raw_ostream &os) const {
|
|
|
|
// Iterate over each cycle.
|
|
|
|
for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
|
|
|
|
// Iterate over each instruction in the cycle.
|
|
|
|
const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
|
|
|
|
for (SUnit *CI : cycleInstrs->second) {
|
|
|
|
os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
|
|
|
|
os << "(" << CI->NodeNum << ") ";
|
|
|
|
CI->getInstr()->print(os);
|
|
|
|
os << "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Utility function used for debugging to print the schedule.
|
2017-01-28 10:02:38 +08:00
|
|
|
LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
|
2019-01-15 01:24:11 +08:00
|
|
|
/// Debugging aid: print the node set to the debug stream.
LLVM_DUMP_METHOD void NodeSet::dump() const {
  raw_ostream &OS = dbgs();
  print(OS);
}
|
|
|
|
|
2017-01-28 10:02:38 +08:00
|
|
|
#endif
|
2019-01-15 01:24:11 +08:00
|
|
|
|
2019-05-29 11:02:59 +08:00
|
|
|
void ResourceManager::initProcResourceVectors(
|
|
|
|
const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
|
|
|
|
unsigned ProcResourceID = 0;
|
|
|
|
|
|
|
|
// We currently limit the resource kinds to 64 and below so that we can use
|
|
|
|
// uint64_t for Masks
|
|
|
|
assert(SM.getNumProcResourceKinds() < 64 &&
|
|
|
|
"Too many kinds of resources, unsupported");
|
|
|
|
// Create a unique bitmask for every processor resource unit.
|
|
|
|
// Skip resource at index 0, since it always references 'InvalidUnit'.
|
|
|
|
Masks.resize(SM.getNumProcResourceKinds());
|
|
|
|
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
|
|
|
|
const MCProcResourceDesc &Desc = *SM.getProcResource(I);
|
|
|
|
if (Desc.SubUnitsIdxBegin)
|
|
|
|
continue;
|
|
|
|
Masks[I] = 1ULL << ProcResourceID;
|
|
|
|
ProcResourceID++;
|
|
|
|
}
|
|
|
|
// Create a unique bitmask for every processor resource group.
|
|
|
|
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
|
|
|
|
const MCProcResourceDesc &Desc = *SM.getProcResource(I);
|
|
|
|
if (!Desc.SubUnitsIdxBegin)
|
|
|
|
continue;
|
|
|
|
Masks[I] = 1ULL << ProcResourceID;
|
|
|
|
for (unsigned U = 0; U < Desc.NumUnits; ++U)
|
|
|
|
Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
|
|
|
|
ProcResourceID++;
|
|
|
|
}
|
|
|
|
LLVM_DEBUG({
|
2019-06-19 04:24:49 +08:00
|
|
|
if (SwpShowResMask) {
|
|
|
|
dbgs() << "ProcResourceDesc:\n";
|
|
|
|
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
|
|
|
|
const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
|
|
|
|
dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
|
|
|
|
ProcResource->Name, I, Masks[I],
|
|
|
|
ProcResource->NumUnits);
|
|
|
|
}
|
|
|
|
dbgs() << " -----------------\n";
|
2019-05-29 11:02:59 +08:00
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {
|
|
|
|
|
2019-06-19 04:24:49 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
if (SwpDebugResource)
|
|
|
|
dbgs() << "canReserveResources:\n";
|
|
|
|
});
|
2019-05-29 11:02:59 +08:00
|
|
|
if (UseDFA)
|
|
|
|
return DFAResources->canReserveResources(MID);
|
|
|
|
|
|
|
|
unsigned InsnClass = MID->getSchedClass();
|
|
|
|
const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
|
|
|
|
if (!SCDesc->isValid()) {
|
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << "No valid Schedule Class Desc for schedClass!\n";
|
|
|
|
dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
|
|
|
|
});
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);
|
|
|
|
const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);
|
|
|
|
for (; I != E; ++I) {
|
|
|
|
if (!I->Cycles)
|
|
|
|
continue;
|
|
|
|
const MCProcResourceDesc *ProcResource =
|
|
|
|
SM.getProcResource(I->ProcResourceIdx);
|
|
|
|
unsigned NumUnits = ProcResource->NumUnits;
|
|
|
|
LLVM_DEBUG({
|
2019-06-19 04:24:49 +08:00
|
|
|
if (SwpDebugResource)
|
|
|
|
dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
|
|
|
|
ProcResource->Name, I->ProcResourceIdx,
|
|
|
|
ProcResourceCount[I->ProcResourceIdx], NumUnits,
|
|
|
|
I->Cycles);
|
2019-05-29 11:02:59 +08:00
|
|
|
});
|
|
|
|
if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)
|
|
|
|
return false;
|
|
|
|
}
|
2019-06-19 04:24:49 +08:00
|
|
|
LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return true\n\n";);
|
2019-05-29 11:02:59 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ResourceManager::reserveResources(const MCInstrDesc *MID) {
|
2019-06-19 04:24:49 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
if (SwpDebugResource)
|
|
|
|
dbgs() << "reserveResources:\n";
|
|
|
|
});
|
2019-05-29 11:02:59 +08:00
|
|
|
if (UseDFA)
|
|
|
|
return DFAResources->reserveResources(MID);
|
|
|
|
|
|
|
|
unsigned InsnClass = MID->getSchedClass();
|
|
|
|
const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
|
|
|
|
if (!SCDesc->isValid()) {
|
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << "No valid Schedule Class Desc for schedClass!\n";
|
|
|
|
dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
|
|
|
|
});
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (const MCWriteProcResEntry &PRE :
|
|
|
|
make_range(STI->getWriteProcResBegin(SCDesc),
|
|
|
|
STI->getWriteProcResEnd(SCDesc))) {
|
|
|
|
if (!PRE.Cycles)
|
|
|
|
continue;
|
|
|
|
++ProcResourceCount[PRE.ProcResourceIdx];
|
|
|
|
LLVM_DEBUG({
|
2019-06-19 04:24:49 +08:00
|
|
|
if (SwpDebugResource) {
|
|
|
|
const MCProcResourceDesc *ProcResource =
|
|
|
|
SM.getProcResource(PRE.ProcResourceIdx);
|
|
|
|
dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
|
|
|
|
ProcResource->Name, PRE.ProcResourceIdx,
|
|
|
|
ProcResourceCount[PRE.ProcResourceIdx],
|
|
|
|
ProcResource->NumUnits, PRE.Cycles);
|
|
|
|
}
|
2019-05-29 11:02:59 +08:00
|
|
|
});
|
|
|
|
}
|
2019-06-19 04:24:49 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
if (SwpDebugResource)
|
|
|
|
dbgs() << "reserveResources: done!\n\n";
|
|
|
|
});
|
2019-05-29 11:02:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool ResourceManager::canReserveResources(const MachineInstr &MI) const {
|
|
|
|
return canReserveResources(&MI.getDesc());
|
|
|
|
}
|
|
|
|
|
|
|
|
void ResourceManager::reserveResources(const MachineInstr &MI) {
|
|
|
|
return reserveResources(&MI.getDesc());
|
|
|
|
}
|
2019-01-15 01:24:11 +08:00
|
|
|
|
2019-05-29 11:02:59 +08:00
|
|
|
void ResourceManager::clearResources() {
|
|
|
|
if (UseDFA)
|
|
|
|
return DFAResources->clearResources();
|
|
|
|
std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
|
|
|
|
}
|