llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp

1461 lines
52 KiB
C++
Raw Normal View History

//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This implements the ScheduleDAGInstrs class, which implements
/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
cl::ZeroOrMore, cl::init(false),
cl::desc("Enable use of AA during MI DAG construction"));
static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
// Note: the two options below might be used in tuning compile time vs
// output quality. Setting HugeRegion so large that it will never be
// reached means best-effort, but may be slow.
// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
// together hold this many SUs, a reduction of maps will be done.
static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
cl::init(1000), cl::desc("The limit to use while constructing the DAG "
"prior to scheduling, at which point a trade-off "
"is made to avoid excessive compile time."));
static cl::opt<unsigned> ReductionSize(
"dag-maps-reduction-size", cl::Hidden,
cl::desc("A huge scheduling region will have maps reduced by this many "
"nodes at a time. Defaults to HugeRegion / 2."));
static unsigned getReductionSize() {
// Always reduce a huge region with half of the elements, except
// when user sets this number explicitly.
if (ReductionSize.getNumOccurrences() == 0)
return HugeRegion / 2;
return ReductionSize;
}
static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << "{ ";
for (const SUnit *su : L) {
dbgs() << "SU(" << su->NodeNum << ")";
if (su != L.back())
dbgs() << ", ";
}
dbgs() << "}\n";
#endif
}
ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineLoopInfo *mli,
bool RemoveKillFlags)
: ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
RemoveKillFlags(RemoveKillFlags),
UnknownValue(UndefValue::get(
Type::getVoidTy(mf.getFunction().getContext()))) {
DbgValues.clear();
const TargetSubtargetInfo &ST = mf.getSubtarget();
SchedModel.init(&ST);
}
/// If this machine instr has memory reference information and it can be
/// tracked to a normal reference to a known object, return the Value
/// for that object. This function returns false the memory location is
/// unknown or may alias anything.
static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
const MachineFrameInfo &MFI,
UnderlyingObjectsVector &Objects,
const DataLayout &DL) {
auto allMMOsOkay = [&]() {
for (const MachineMemOperand *MMO : MI->memoperands()) {
if (MMO->isVolatile())
return false;
if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
// Function that contain tail calls don't have unique PseudoSourceValue
// objects. Two PseudoSourceValues might refer to the same or
// overlapping locations. The client code calling this function assumes
// this is not the case. So return a conservative answer of no known
// object.
if (MFI.hasTailCall())
return false;
// For now, ignore PseudoSourceValues which may alias LLVM IR values
// because the code that uses this function has no way to cope with
// such aliases.
if (PSV->isAliased(&MFI))
return false;
bool MayAlias = PSV->mayAlias(&MFI);
Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
} else if (const Value *V = MMO->getValue()) {
SmallVector<Value *, 4> Objs;
if (!getUnderlyingObjectsForCodeGen(V, Objs, DL))
return false;
for (Value *V : Objs) {
assert(isIdentifiedObject(V));
Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
}
} else
return false;
}
return true;
};
if (!allMMOsOkay()) {
Objects.clear();
return false;
}
return true;
}
void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
BB = bb;
}
void ScheduleDAGInstrs::finishBlock() {
// Subclasses should no longer refer to the old block.
BB = nullptr;
misched preparation: clarify ScheduleDAG and ScheduleDAGInstrs roles. ScheduleDAG is responsible for the DAG: SUnits and SDeps. It provides target hooks for latency computation. ScheduleDAGInstrs extends ScheduleDAG and defines the current scheduling region in terms of MachineInstr iterators. It has access to the target's scheduling itinerary data. ScheduleDAGInstrs provides the logic for building the ScheduleDAG for the sequence of MachineInstrs in the current region. Target's can implement highly custom schedulers by extending this class. ScheduleDAGPostRATDList provides the driver and diagnostics for current postRA scheduling. It maintains a current Sequence of scheduled machine instructions and logic for splicing them into the block. During scheduling, it uses the ScheduleHazardRecognizer provided by the target. Specific changes: - Removed driver code from ScheduleDAG. clearDAG is the only interface needed. - Added enterRegion/exitRegion hooks to ScheduleDAGInstrs to delimit the scope of each scheduling region and associated DAG. They should be used to setup and cleanup any region-specific state in addition to the DAG itself. This is necessary because we reuse the same ScheduleDAG object for the entire function. The target may extend these hooks to do things at regions boundaries, like bundle terminators. The hooks are called even if we decide not to schedule the region. So all instructions in a block are "covered" by these calls. - Added ScheduleDAGInstrs::begin()/end() public API. - Moved Sequence into the driver layer, which is specific to the scheduling algorithm. llvm-svn: 152208
2012-03-07 13:21:52 +08:00
}
void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
unsigned regioninstrs) {
assert(bb == BB && "startBlock should set BB");
RegionBegin = begin;
RegionEnd = end;
NumRegionInstrs = regioninstrs;
misched preparation: clarify ScheduleDAG and ScheduleDAGInstrs roles. ScheduleDAG is responsible for the DAG: SUnits and SDeps. It provides target hooks for latency computation. ScheduleDAGInstrs extends ScheduleDAG and defines the current scheduling region in terms of MachineInstr iterators. It has access to the target's scheduling itinerary data. ScheduleDAGInstrs provides the logic for building the ScheduleDAG for the sequence of MachineInstrs in the current region. Target's can implement highly custom schedulers by extending this class. ScheduleDAGPostRATDList provides the driver and diagnostics for current postRA scheduling. It maintains a current Sequence of scheduled machine instructions and logic for splicing them into the block. During scheduling, it uses the ScheduleHazardRecognizer provided by the target. Specific changes: - Removed driver code from ScheduleDAG. clearDAG is the only interface needed. - Added enterRegion/exitRegion hooks to ScheduleDAGInstrs to delimit the scope of each scheduling region and associated DAG. They should be used to setup and cleanup any region-specific state in addition to the DAG itself. This is necessary because we reuse the same ScheduleDAG object for the entire function. The target may extend these hooks to do things at regions boundaries, like bundle terminators. The hooks are called even if we decide not to schedule the region. So all instructions in a block are "covered" by these calls. - Added ScheduleDAGInstrs::begin()/end() public API. - Moved Sequence into the driver layer, which is specific to the scheduling algorithm. llvm-svn: 152208
2012-03-07 13:21:52 +08:00
}
void ScheduleDAGInstrs::exitRegion() {
// Nothing to do.
}
void ScheduleDAGInstrs::addSchedBarrierDeps() {
MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
ExitSU.setInstr(ExitMI);
// Add dependencies on the defs and uses of the instruction.
if (ExitMI) {
for (const MachineOperand &MO : ExitMI->operands()) {
if (!MO.isReg() || MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
} else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
}
}
}
if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
// For others, e.g. fallthrough, conditional branch, assume the exit
// uses all the registers that are livein to the successor blocks.
for (const MachineBasicBlock *Succ : BB->successors()) {
for (const auto &LI : Succ->liveins()) {
if (!Uses.contains(LI.PhysReg))
Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
}
}
}
}
/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
assert(MO.isDef() && "expect physreg def");
// Ask the target if address-backscheduling is desirable, and if so how much.
const TargetSubtargetInfo &ST = MF.getSubtarget();
// Only use any non-zero latency for real defs/uses, in contrast to
// "fake" operands added by regalloc.
const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
!DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
Alias.isValid(); ++Alias) {
if (!Uses.contains(*Alias))
continue;
for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
SUnit *UseSU = I->SU;
if (UseSU == SU)
continue;
// Adjust the dependence latency using operand def/use information,
// then allow the target to perform its own adjustments.
int UseOp = I->OpIdx;
MachineInstr *RegUse = nullptr;
SDep Dep;
if (UseOp < 0)
Dep = SDep(SU, SDep::Artificial);
else {
// Set the hasPhysRegDefs only for physreg defs that have a use within
// the scheduling region.
SU->hasPhysRegDefs = true;
Dep = SDep(SU, SDep::Data, *Alias);
RegUse = UseSU->getInstr();
}
const MCInstrDesc *UseMIDesc =
(RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
bool ImplicitPseudoUse =
(UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
!UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
RegUse, UseOp));
ST.adjustSchedDependency(SU, UseSU, Dep);
} else
Dep.setLatency(0);
UseSU->addPred(Dep);
}
}
}
/// Adds register dependencies (data, anti, and output) from this SUnit
/// to following instructions in the same scheduling region that depend the
/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
unsigned Reg = MO.getReg();
// We do not need to track any dependencies for constant registers.
if (MRI.isConstantPhysReg(Reg))
return;
// Optionally add output and anti dependencies. For anti
// dependencies we use a latency of 0 because for a multi-issue
// target we want to allow the defining instruction to issue
// in the same cycle as the using instruction.
// TODO: Using a latency of 1 here for output dependencies assumes
// there's no cost for reusing registers.
SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
if (!Defs.contains(*Alias))
continue;
for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
SUnit *DefSU = I->SU;
if (DefSU == &ExitSU)
continue;
if (DefSU != SU &&
(Kind != SDep::Output || !MO.isDead() ||
!DefSU->getInstr()->registerDefIsDead(*Alias))) {
if (Kind == SDep::Anti)
DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
else {
SDep Dep(SU, Kind, /*Reg=*/*Alias);
Dep.setLatency(
SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
DefSU->addPred(Dep);
}
}
}
}
if (!MO.isDef()) {
SU->hasPhysRegUses = true;
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's uses.
// Push this SUnit on the use list.
Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
if (RemoveKillFlags)
MO.setIsKill(false);
} else {
addPhysRegDataDeps(SU, OperIdx);
// Clear previous uses and defs of this register and its subergisters.
for (MCSubRegIterator SubReg(Reg, TRI, true); SubReg.isValid(); ++SubReg) {
if (Uses.contains(*SubReg))
Uses.eraseAll(*SubReg);
if (!MO.isDead())
Defs.eraseAll(*SubReg);
}
if (MO.isDead() && SU->isCall) {
// Calls will not be reordered because of chain dependencies (see
// below). Since call operands are dead, calls may continue to be added
// to the DefList making dependence checking quadratic in the size of
// the block. Instead, we leave only one call at the back of the
// DefList.
Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
Reg2SUnitsMap::iterator B = P.first;
Reg2SUnitsMap::iterator I = P.second;
for (bool isBegin = I == B; !isBegin; /* empty */) {
isBegin = (--I) == B;
if (!I->SU->isCall)
break;
I = Defs.erase(I);
}
}
// Defs are pushed in the order they are visited and never reordered.
Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
}
}
LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
{
unsigned Reg = MO.getReg();
// No point in tracking lanemasks if we don't have interesting subregisters.
const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
if (!RC.HasDisjunctSubRegs)
return LaneBitmask::getAll();
unsigned SubReg = MO.getSubReg();
if (SubReg == 0)
return RC.getLaneMask();
return TRI->getSubRegIndexLaneMask(SubReg);
}
/// Adds register output and data dependencies from this SUnit to instructions
/// that occur later in the same scheduling region if they read from or write to
/// the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
unsigned Reg = MO.getReg();
LaneBitmask DefLaneMask;
LaneBitmask KillLaneMask;
if (TrackLaneMasks) {
bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
DefLaneMask = getLaneMaskForMO(MO);
// If we have a <read-undef> flag, none of the lane values comes from an
// earlier instruction.
KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;
// Clear undef flag, we'll re-add it later once we know which subregister
// Def is first.
MO.setIsUndef(false);
} else {
DefLaneMask = LaneBitmask::getAll();
KillLaneMask = LaneBitmask::getAll();
}
if (MO.isDead()) {
assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
"Dead defs should have no uses");
} else {
// Add data dependence to all uses we found so far.
const TargetSubtargetInfo &ST = MF.getSubtarget();
for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
E = CurrentVRegUses.end(); I != E; /*empty*/) {
LaneBitmask LaneMask = I->LaneMask;
// Ignore uses of other lanes.
if ((LaneMask & KillLaneMask).none()) {
++I;
continue;
}
if ((LaneMask & DefLaneMask).any()) {
SUnit *UseSU = I->SU;
MachineInstr *Use = UseSU->getInstr();
SDep Dep(SU, SDep::Data, Reg);
Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
I->OperandIndex));
ST.adjustSchedDependency(SU, UseSU, Dep);
UseSU->addPred(Dep);
}
LaneMask &= ~KillLaneMask;
// If we found a Def for all lanes of this use, remove it from the list.
if (LaneMask.any()) {
I->LaneMask = LaneMask;
++I;
} else
I = CurrentVRegUses.erase(I);
}
}
// Shortcut: Singly defined vregs do not have output/anti dependencies.
if (MRI.hasOneDef(Reg))
return;
// Add output dependence to the next nearest defs of this vreg.
//
// Unless this definition is dead, the output dependence should be
// transitively redundant with antidependencies from this definition's
// uses. We're conservative for now until we have a way to guarantee the uses
// are not eliminated sometime during scheduling. The output dependence edge
// is also useful if output latency exceeds def-use latency.
LaneBitmask LaneMask = DefLaneMask;
for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
CurrentVRegDefs.end())) {
// Ignore defs for other lanes.
if ((V2SU.LaneMask & LaneMask).none())
continue;
// Add an output dependence.
SUnit *DefSU = V2SU.SU;
// Ignore additional defs of the same lanes in one instruction. This can
// happen because lanemasks are shared for targets with too many
// subregisters. We also use some representration tricks/hacks where we
// add super-register defs/uses, to imply that although we only access parts
// of the reg we care about the full one.
if (DefSU == SU)
continue;
SDep Dep(SU, SDep::Output, Reg);
Dep.setLatency(
SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
DefSU->addPred(Dep);
// Update current definition. This can get tricky if the def was about a
// bigger lanemask before. We then have to shrink it and create a new
// VReg2SUnit for the non-overlapping part.
LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
V2SU.SU = SU;
V2SU.LaneMask = OverlapMask;
if (NonOverlapMask.any())
CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
}
// If there was no CurrentVRegDefs entry for some lanes yet, create one.
if (LaneMask.any())
CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}
/// Adds a register data dependency if the instruction that defines the
/// virtual register used at OperIdx is mapped to an SUnit. Add a register
/// antidependency from this SUnit to instructions that occur later in the same
/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
const MachineInstr *MI = SU->getInstr();
const MachineOperand &MO = MI->getOperand(OperIdx);
unsigned Reg = MO.getReg();
// Remember the use. Data dependencies will be added when we find the def.
LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
: LaneBitmask::getAll();
CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
// Add antidependences to the following defs of the vreg.
for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
CurrentVRegDefs.end())) {
// Ignore defs for unrelated lanes.
LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
if ((PrevDefLaneMask & LaneMask).none())
continue;
if (V2SU.SU == SU)
continue;
V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
}
}
/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}
void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
unsigned Latency) {
if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
SDep Dep(SUa, SDep::MayAliasMem);
Dep.setLatency(Latency);
SUb->addPred(Dep);
}
}
/// Creates an SUnit for each real instruction, numbered in top-down
/// topological order. The instruction order A < B, implies that no edge exists
/// from B to A.
///
/// Map each real instruction to its SUnit.
///
/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
/// instead of pointers.
///
/// MachineScheduler relies on initSUnits numbering the nodes by their order in
/// the original instruction list.
void ScheduleDAGInstrs::initSUnits() {
// We'll be allocating one SUnit for each real instruction in the region,
// which is contained within a basic block.
SUnits.reserve(NumRegionInstrs);
for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
if (MI.isDebugInstr())
continue;
SUnit *SU = newSUnit(&MI);
MISUnitMap[&MI] = SU;
SU->isCall = MI.isCall();
SU->isCommutable = MI.isCommutable();
// Assign the Latency field of SU using target-provided information.
SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
MI-Sched: handle latency of in-order operations with the new machine model. The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | llvm-svn: 196516
2013-12-06 01:55:58 +08:00
// If this SUnit uses a reserved or unbuffered resource, mark it as such.
//
2014-05-15 09:52:21 +08:00
// Reserved resources block an instruction from issuing and stall the
// entire pipeline. These are identified by BufferSize=0.
//
2014-05-15 09:52:21 +08:00
// Unbuffered resources prevent execution of subsequent instructions that
// require the same resources. This is used for in-order execution pipelines
// within an out-of-order core. These are identified by BufferSize=1.
MI-Sched: handle latency of in-order operations with the new machine model. The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | llvm-svn: 196516
2013-12-06 01:55:58 +08:00
if (SchedModel.hasInstrSchedModel()) {
const MCSchedClassDesc *SC = getSchedClass(SU);
for (const MCWriteProcResEntry &PRE :
make_range(SchedModel.getWriteProcResBegin(SC),
SchedModel.getWriteProcResEnd(SC))) {
switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
case 0:
SU->hasReservedResource = true;
break;
case 1:
MI-Sched: handle latency of in-order operations with the new machine model. The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | llvm-svn: 196516
2013-12-06 01:55:58 +08:00
SU->isUnbuffered = true;
break;
default:
break;
MI-Sched: handle latency of in-order operations with the new machine model. The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | llvm-svn: 196516
2013-12-06 01:55:58 +08:00
}
}
}
}
}
class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
/// Current total number of SUs in map.
unsigned NumNodes = 0;
/// 1 for loads, 0 for stores. (see comment in SUList)
unsigned TrueMemOrderLatency;
public:
Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
/// To keep NumNodes up to date, insert() is used instead of
/// this operator w/ push_back().
ValueType &operator[](const SUList &Key) {
llvm_unreachable("Don't use. Use insert() instead."); };
/// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
/// reduce().
void inline insert(SUnit *SU, ValueType V) {
MapVector::operator[](V).push_back(SU);
NumNodes++;
}
/// Clears the list of SUs mapped to V.
void inline clearList(ValueType V) {
iterator Itr = find(V);
if (Itr != end()) {
assert(NumNodes >= Itr->second.size());
NumNodes -= Itr->second.size();
Itr->second.clear();
}
}
/// Clears map from all contents.
void clear() {
MapVector<ValueType, SUList>::clear();
NumNodes = 0;
}
unsigned inline size() const { return NumNodes; }
/// Counts the number of SUs in this map after a reduction.
void reComputeSize() {
NumNodes = 0;
for (auto &I : *this)
NumNodes += I.second.size();
}
unsigned inline getTrueMemOrderLatency() const {
return TrueMemOrderLatency;
}
void dump();
};
void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
Value2SUsMap &Val2SUsMap) {
for (auto &I : Val2SUsMap)
addChainDependencies(SU, I.second,
Val2SUsMap.getTrueMemOrderLatency());
}
void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
Value2SUsMap &Val2SUsMap,
ValueType V) {
Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
if (Itr != Val2SUsMap.end())
addChainDependencies(SU, Itr->second,
Val2SUsMap.getTrueMemOrderLatency());
}
void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
assert(BarrierChain != nullptr);
for (auto &I : map) {
SUList &sus = I.second;
for (auto *SU : sus)
SU->addPredBarrier(BarrierChain);
}
map.clear();
}
void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
assert(BarrierChain != nullptr);
// Go through all lists of SUs.
for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
Value2SUsMap::iterator CurrItr = I++;
SUList &sus = CurrItr->second;
SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
for (; SUItr != SUEE; ++SUItr) {
// Stop on BarrierChain or any instruction above it.
if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
break;
(*SUItr)->addPredBarrier(BarrierChain);
}
// Remove also the BarrierChain from list if present.
if (SUItr != SUEE && *SUItr == BarrierChain)
SUItr++;
// Remove all SUs that are now successors of BarrierChain.
if (SUItr != sus.begin())
sus.erase(sus.begin(), SUItr);
}
// Remove all entries with empty su lists.
map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
return (mapEntry.second.empty()); });
// Recompute the size of the map (NumNodes).
map.reComputeSize();
}
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
RegPressureTracker *RPTracker,
PressureDiffs *PDiffs,
LiveIntervals *LIS,
bool TrackLaneMasks) {
const TargetSubtargetInfo &ST = MF.getSubtarget();
bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
: ST.useAA();
AAForDep = UseAA ? AA : nullptr;
BarrierChain = nullptr;
this->TrackLaneMasks = TrackLaneMasks;
MISUnitMap.clear();
ScheduleDAG::clearDAG();
// Create an SUnit for each real instruction.
initSUnits();
if (PDiffs)
PDiffs->init(SUnits.size());
// We build scheduling units by walking a block's instruction list
// from bottom to top.
// Each MIs' memory operand(s) is analyzed to a list of underlying
// objects. The SU is then inserted in the SUList(s) mapped from the
// Value(s). Each Value thus gets mapped to lists of SUs depending
// on it, stores and loads kept separately. Two SUs are trivially
// non-aliasing if they both depend on only identified Values and do
// not share any common Value.
Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
// Certain memory accesses are known to not alias any SU in Stores
// or Loads, and have therefore their own 'NonAlias'
// domain. E.g. spill / reload instructions never alias LLVM I/R
// Values. It would be nice to assume that this type of memory
// accesses always have a proper memory operand modelling, and are
// therefore never unanalyzable, but this is conservatively not
// done.
Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
// Remove any stale debug info; sometimes BuildSchedGraph is called again
// without emitting the info from the previous call.
DbgValues.clear();
FirstDbgValue = nullptr;
assert(Defs.empty() && Uses.empty() &&
"Only BuildGraph should update Defs/Uses");
Defs.setUniverse(TRI->getNumRegs());
Uses.setUniverse(TRI->getNumRegs());
assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
unsigned NumVirtRegs = MRI.getNumVirtRegs();
CurrentVRegDefs.setUniverse(NumVirtRegs);
CurrentVRegUses.setUniverse(NumVirtRegs);
// Model data dependencies between instructions being scheduled and the
// ExitSU.
addSchedBarrierDeps();
// Walk the list of instructions, from bottom moving up.
MachineInstr *DbgMI = nullptr;
for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
MII != MIE; --MII) {
MachineInstr &MI = *std::prev(MII);
if (DbgMI) {
DbgValues.push_back(std::make_pair(DbgMI, &MI));
DbgMI = nullptr;
}
if (MI.isDebugValue()) {
DbgMI = &MI;
continue;
}
if (MI.isDebugLabel())
continue;
SUnit *SU = MISUnitMap[&MI];
assert(SU && "No SUnit mapped to this MI");
if (RPTracker) {
RegisterOperands RegOpers;
RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
if (TrackLaneMasks) {
SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
}
if (PDiffs != nullptr)
PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);
if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
RPTracker->recedeSkipDebugValues();
assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
RPTracker->recede(RegOpers);
}
assert(
(CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
"Cannot schedule terminators or labels!");
// Add register-based dependencies (data, anti, and output).
// For some instructions (calls, returns, inline-asm, etc.) there can
// be explicit uses and implicit defs, in which case the use will appear
// on the operand list before the def. Do two passes over the operand
// list to make sure that defs are processed before any uses.
bool HasVRegDef = false;
for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
const MachineOperand &MO = MI.getOperand(j);
if (!MO.isReg() || !MO.isDef())
continue;
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
addPhysRegDeps(SU, j);
} else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
HasVRegDef = true;
addVRegDefDeps(SU, j);
}
}
// Now process all uses.
for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
const MachineOperand &MO = MI.getOperand(j);
// Only look at use operands.
// We do not need to check for MO.readsReg() here because subsequent
// subregister defs will get output dependence edges and need no
// additional use dependencies.
if (!MO.isReg() || !MO.isUse())
continue;
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
addPhysRegDeps(SU, j);
} else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
addVRegUseDeps(SU, j);
}
}
// If we haven't seen any uses in this scheduling region, create a
// dependence edge to ExitSU to model the live-out latency. This is required
// for vreg defs with no in-region use, and prefetches with no vreg def.
//
// FIXME: NumDataSuccs would be more precise than NumSuccs here. This
// check currently relies on being called before adding chain deps.
if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
SDep Dep(SU, SDep::Artificial);
Dep.setLatency(SU->Latency - 1);
ExitSU.addPred(Dep);
}
// Add memory dependencies (Note: isStoreToStackSlot and
// isLoadFromStackSLot are not usable after stack slots are lowered to
// actual addresses).
// This is a barrier event that acts as a pivotal node in the DAG.
if (isGlobalMemoryObject(AA, &MI)) {
// Become the barrier chain.
if (BarrierChain)
BarrierChain->addPredBarrier(SU);
BarrierChain = SU;
LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
<< BarrierChain->NodeNum << ").\n";);
[ScheduleDAGInstrs::buildSchedGraph()] Handling of memory dependecies rewritten. The buildSchedGraph() was in need of reworking as the AA features had been added on top of earlier code. It was very difficult to understand, and buggy. There had been found cases where scheduling dependencies had actually been missed (see r228686). AliasChain, RejectMemNodes, adjustChainDeps() and iterateChainSucc() have been removed. There are instead now just the four maps from Value to SUs, which have been renamed to Stores, Loads, NonAliasStores and NonAliasLoads. An unknown store used to become the AliasChain, but now becomes a store mapped to 'unknownValue' (in Stores). What used to be PendingLoads is instead the list of SUs mapped to 'unknownValue' in Loads. RejectMemNodes and adjustChainDeps() used to be a safety-net for everything. The SU maps were sometimes cleared and SUs were put in RejectMemNodes, where adjustChainDeps() would look. Instead of this, a more straight forward approach is used in maintaining the SU maps without clearing them and simply letting them grow over time. Instead of the cutt-off in adjustChainDeps() search, a reduction of maps will be done if needed (see below). Each SUnit either becomes the BarrierChain, or is put into one of the maps. For each SUnit encountered, all the information about previous ones are still available until a new BarrierChain is set, at which point the maps are cleared. For huge regions, the algorithm becomes slow, therefore the maps will get reduced at a threshold (current default is 1000 nodes), by a fraction (default 1/2). These values can be tuned by use of CL options in case some test case shows that they need to be changed (-dag-maps-huge-region and -dag-maps-reduction-size). There has not been any considerable change observed in output quality or compile time. There may now be more DAG edges inserted than before (i.e. if A->B->C, then A->C is not needed). However, in a comparison run there were fewer total calls to AA, and a somewhat improved compile time, which means this seems to be not a problem. http://reviews.llvm.org/D8705 Reviewers: Hal Finkel, Andy Trick. llvm-svn: 259201
2016-01-30 00:11:18 +08:00
// Add dependencies against everything below it and clear maps.
addBarrierChain(Stores);
addBarrierChain(Loads);
addBarrierChain(NonAliasStores);
addBarrierChain(NonAliasLoads);
continue;
}
// If it's not a store or a variant load, we're done.
if (!MI.mayStore() &&
!(MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA)))
continue;
// Always add dependecy edge to BarrierChain if present.
if (BarrierChain)
BarrierChain->addPredBarrier(SU);
// Find the underlying objects for MI. The Objs vector is either
// empty, or filled with the Values of memory locations which this
// SU depends on.
UnderlyingObjectsVector Objs;
bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
MF.getDataLayout());
if (MI.mayStore()) {
if (!ObjsFound) {
// An unknown store depends on all stores and loads.
addChainDependencies(SU, Stores);
addChainDependencies(SU, NonAliasStores);
addChainDependencies(SU, Loads);
addChainDependencies(SU, NonAliasLoads);
// Map this store to 'UnknownValue'.
Stores.insert(SU, UnknownValue);
} else {
// Add precise dependencies against all previously seen memory
// accesses mapped to the same Value(s).
for (const UnderlyingObject &UnderlObj : Objs) {
ValueType V = UnderlObj.getValue();
bool ThisMayAlias = UnderlObj.mayAlias();
// Add dependencies to previous stores and loads mapped to V.
addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
}
// Update the store map after all chains have been added to avoid adding
// self-loop edge if multiple underlying objects are present.
for (const UnderlyingObject &UnderlObj : Objs) {
ValueType V = UnderlObj.getValue();
bool ThisMayAlias = UnderlObj.mayAlias();
// Map this store to V.
(ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
}
// The store may have dependencies to unanalyzable loads and
// stores.
addChainDependencies(SU, Loads, UnknownValue);
addChainDependencies(SU, Stores, UnknownValue);
}
} else { // SU is a load.
if (!ObjsFound) {
// An unknown load depends on all stores.
addChainDependencies(SU, Stores);
addChainDependencies(SU, NonAliasStores);
Loads.insert(SU, UnknownValue);
} else {
for (const UnderlyingObject &UnderlObj : Objs) {
ValueType V = UnderlObj.getValue();
bool ThisMayAlias = UnderlObj.mayAlias();
// Add precise dependencies against all previously seen stores
// mapping to the same Value(s).
addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
// Map this load to V.
(ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
}
// The load may have dependencies to unanalyzable stores.
addChainDependencies(SU, Stores, UnknownValue);
2011-05-06 03:24:06 +08:00
}
}
// Reduce maps if they grow huge.
if (Stores.size() + Loads.size() >= HugeRegion) {
LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
}
if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
LLVM_DEBUG(
dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
}
}
if (DbgMI)
FirstDbgValue = DbgMI;
Defs.clear();
Uses.clear();
CurrentVRegDefs.clear();
CurrentVRegUses.clear();
}
raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
PSV->printCustom(OS);
return OS;
}
void ScheduleDAGInstrs::Value2SUsMap::dump() {
for (auto &Itr : *this) {
if (Itr.first.is<const Value*>()) {
const Value *V = Itr.first.get<const Value*>();
if (isa<UndefValue>(V))
dbgs() << "Unknown";
else
V->printAsOperand(dbgs());
}
else if (Itr.first.is<const PseudoSourceValue*>())
dbgs() << Itr.first.get<const PseudoSourceValue*>();
else
llvm_unreachable("Unknown Value type.");
dbgs() << " : ";
dumpSUList(Itr.second);
}
}
void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
Value2SUsMap &loads, unsigned N) {
LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
dbgs() << "Loading SUnits:\n"; loads.dump());
// Insert all SU's NodeNums into a vector and sort it.
std::vector<unsigned> NodeNums;
NodeNums.reserve(stores.size() + loads.size());
for (auto &I : stores)
for (auto *SU : I.second)
NodeNums.push_back(SU->NodeNum);
for (auto &I : loads)
for (auto *SU : I.second)
NodeNums.push_back(SU->NodeNum);
llvm::sort(NodeNums);
// The N last elements in NodeNums will be removed, and the SU with
// the lowest NodeNum of them will become the new BarrierChain to
// let the not yet seen SUs have a dependency to the removed SUs.
assert(N <= NodeNums.size());
SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
if (BarrierChain) {
// The aliasing and non-aliasing maps reduce independently of each
// other, but share a common BarrierChain. Check if the
// newBarrierChain is above the former one. If it is not, it may
// introduce a loop to use newBarrierChain, so keep the old one.
if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
BarrierChain->addPredBarrier(newBarrierChain);
BarrierChain = newBarrierChain;
LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
<< BarrierChain->NodeNum << ").\n";);
}
else
LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
<< BarrierChain->NodeNum << ").\n";);
}
else
BarrierChain = newBarrierChain;
insertBarrierChain(stores);
insertBarrierChain(loads);
LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
dbgs() << "Loading SUnits:\n"; loads.dump());
}
static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
MachineInstr &MI, bool addToLiveRegs) {
for (MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.readsReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
// Things that are available after the instruction are killed by it.
bool IsKill = LiveRegs.available(MRI, Reg);
MO.setIsKill(IsKill);
if (addToLiveRegs)
LiveRegs.addReg(Reg);
}
}
void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
LiveRegs.init(*TRI);
LiveRegs.addLiveOuts(MBB);
// Examine block from end to start...
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (MI.isDebugInstr())
continue;
// Update liveness. Registers that are defed but not used in this
// instruction are now dead. Mark register and all subregs as they
// are completely defined.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
const MachineOperand &MO = *O;
if (MO.isReg()) {
if (!MO.isDef())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
LiveRegs.removeReg(Reg);
} else if (MO.isRegMask()) {
LiveRegs.removeRegsInMask(MO);
}
}
// If there is a bundle header fix it up first.
if (!MI.isBundled()) {
toggleKills(MRI, LiveRegs, MI, true);
} else {
MachineBasicBlock::instr_iterator First = MI.getIterator();
if (MI.isBundle()) {
toggleKills(MRI, LiveRegs, MI, false);
++First;
}
// Some targets make the (questionable) assumtion that the instructions
// inside the bundle are ordered and consequently only the last use of
// a register inside the bundle can kill it.
MachineBasicBlock::instr_iterator I = std::next(First);
while (I->isBundledWithSucc())
++I;
do {
if (!I->isDebugInstr())
toggleKills(MRI, LiveRegs, *I, true);
--I;
} while(I != First);
}
}
}
void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dumpNodeName(SU);
dbgs() << ": ";
SU.getInstr()->dump();
#endif
}
void ScheduleDAGInstrs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (EntrySU.getInstr() != nullptr)
dumpNodeAll(EntrySU);
for (const SUnit &SU : SUnits)
dumpNodeAll(SU);
if (ExitSU.getInstr() != nullptr)
dumpNodeAll(ExitSU);
#endif
}
std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
std::string s;
raw_string_ostream oss(s);
if (SU == &EntrySU)
oss << "<entry>";
else if (SU == &ExitSU)
oss << "<exit>";
else
SU->getInstr()->print(oss, /*SkipOpers=*/true);
return oss.str();
}
/// Return the basic block label. It is not necessarilly unique because a block
/// contains multiple scheduling regions. But it is fine for visualization.
std::string ScheduleDAGInstrs::getDAGName() const {
return "dag." + BB->getFullName();
}
//===----------------------------------------------------------------------===//
// SchedDFSResult Implementation
//===----------------------------------------------------------------------===//
namespace llvm {
/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
SchedDFSResult &R;
/// Join DAG nodes into equivalence classes by their subtree.
IntEqClasses SubtreeClasses;
/// List PredSU, SuccSU pairs that represent data edges between subtrees.
std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
struct RootData {
unsigned NodeID;
unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
/// children.
RootData(unsigned id): NodeID(id),
ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
unsigned getSparseSetIndex() const { return NodeID; }
};
SparseSet<RootData> RootSet;
public:
SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
RootSet.setUniverse(R.DFSNodeData.size());
}
/// Returns true if this node been visited by the DFS traversal.
///
/// During visitPostorderNode the Node's SubtreeID is assigned to the Node
/// ID. Later, SubtreeID is updated but remains valid.
bool isVisited(const SUnit *SU) const {
return R.DFSNodeData[SU->NodeNum].SubtreeID
!= SchedDFSResult::InvalidSubtreeID;
}
/// Initializes this node's instruction count. We don't need to flag the node
/// visited until visitPostorder because the DAG cannot have cycles.
void visitPreorder(const SUnit *SU) {
R.DFSNodeData[SU->NodeNum].InstrCount =
SU->getInstr()->isTransient() ? 0 : 1;
}
/// Called once for each node after all predecessors are visited. Revisit this
/// node's predecessors and potentially join them now that we know the ILP of
/// the other predecessors.
void visitPostorderNode(const SUnit *SU) {
// Mark this node as the root of a subtree. It may be joined with its
// successors later.
R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
RootData RData(SU->NodeNum);
RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
// If any predecessors are still in their own subtree, they either cannot be
// joined or are large enough to remain separate. If this parent node's
// total instruction count is not greater than a child subtree by at least
// the subtree limit, then try to join it now since splitting subtrees is
// only useful if multiple high-pressure paths are possible.
unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
for (const SDep &PredDep : SU->Preds) {
if (PredDep.getKind() != SDep::Data)
continue;
unsigned PredNum = PredDep.getSUnit()->NodeNum;
if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);
// Either link or merge the TreeData entry from the child to the parent.
if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
// If the predecessor's parent is invalid, this is a tree edge and the
// current node is the parent.
if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
RootSet[PredNum].ParentNodeID = SU->NodeNum;
}
else if (RootSet.count(PredNum)) {
// The predecessor is not a root, but is still in the root set. This
// must be the new parent that it was just joined to. Note that
// RootSet[PredNum].ParentNodeID may either be invalid or may still be
// set to the original parent.
RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
RootSet.erase(PredNum);
}
}
RootSet[SU->NodeNum] = RData;
}
/// Called once for each tree edge after calling visitPostOrderNode on
/// the predecessor. Increment the parent node's instruction count and
/// preemptively join this subtree to its parent's if it is small enough.
void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
R.DFSNodeData[Succ->NodeNum].InstrCount
+= R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
joinPredSubtree(PredDep, Succ);
}
/// Adds a connection for cross edges.
void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
}
/// Sets each node's subtree ID to the representative ID and record
/// connections between trees.
void finalize() {
SubtreeClasses.compress();
R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
assert(SubtreeClasses.getNumClasses() == RootSet.size()
&& "number of roots should match trees");
for (const RootData &Root : RootSet) {
unsigned TreeID = SubtreeClasses[Root.NodeID];
if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
// Note that SubInstrCount may be greater than InstrCount if we joined
// subtrees across a cross edge. InstrCount will be attributed to the
// original parent, while SubInstrCount will be attributed to the joined
// parent.
}
R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
LLVM_DEBUG(dbgs() << " SU(" << Idx << ") in tree "
<< R.DFSNodeData[Idx].SubtreeID << '\n');
}
for (const std::pair<const SUnit*, const SUnit*> &P : ConnectionPairs) {
unsigned PredTree = SubtreeClasses[P.first->NodeNum];
unsigned SuccTree = SubtreeClasses[P.second->NodeNum];
if (PredTree == SuccTree)
continue;
unsigned Depth = P.first->getDepth();
addConnection(PredTree, SuccTree, Depth);
addConnection(SuccTree, PredTree, Depth);
}
}
protected:
/// Joins the predecessor subtree with the successor that is its DFS parent.
/// Applies some heuristics before joining.
bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
bool CheckLimit = true) {
assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
// Check if the predecessor is already joined.
const SUnit *PredSU = PredDep.getSUnit();
unsigned PredNum = PredSU->NodeNum;
if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
return false;
// Four is the magic number of successors before a node is considered a
// pinch point.
unsigned NumDataSucs = 0;
for (const SDep &SuccDep : PredSU->Succs) {
if (SuccDep.getKind() == SDep::Data) {
if (++NumDataSucs >= 4)
return false;
}
}
if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
return false;
R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
SubtreeClasses.join(Succ->NodeNum, PredNum);
return true;
}
/// Called by finalize() to record a connection between trees.
void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
if (!Depth)
return;
do {
SmallVectorImpl<SchedDFSResult::Connection> &Connections =
R.SubtreeConnections[FromTree];
for (SchedDFSResult::Connection &C : Connections) {
if (C.TreeID == ToTree) {
C.Level = std::max(C.Level, Depth);
return;
}
}
Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
FromTree = R.DFSTreeData[FromTree].ParentTreeID;
} while (FromTree != SchedDFSResult::InvalidSubtreeID);
}
};
} // end namespace llvm
namespace {
/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
public:
bool isComplete() const { return DFSStack.empty(); }
void follow(const SUnit *SU) {
DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
}
void advance() { ++DFSStack.back().second; }
const SDep *backtrack() {
DFSStack.pop_back();
return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
}
const SUnit *getCurr() const { return DFSStack.back().first; }
SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
SUnit::const_pred_iterator getPredEnd() const {
return getCurr()->Preds.end();
}
};
} // end anonymous namespace
static bool hasDataSucc(const SUnit *SU) {
for (const SDep &SuccDep : SU->Succs) {
if (SuccDep.getKind() == SDep::Data &&
!SuccDep.getSUnit()->isBoundaryNode())
return true;
}
return false;
}
/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
if (!IsBottomUp)
2017-08-04 06:41:12 +08:00
llvm_unreachable("Top-down ILP metric is unimplemented");
SchedDFSImpl Impl(*this);
for (const SUnit &SU : SUnits) {
if (Impl.isVisited(&SU) || hasDataSucc(&SU))
continue;
SchedDAGReverseDFS DFS;
Impl.visitPreorder(&SU);
DFS.follow(&SU);
while (true) {
// Traverse the leftmost path as far as possible.
while (DFS.getPred() != DFS.getPredEnd()) {
const SDep &PredDep = *DFS.getPred();
DFS.advance();
// Ignore non-data edges.
if (PredDep.getKind() != SDep::Data
|| PredDep.getSUnit()->isBoundaryNode()) {
continue;
}
// An already visited edge is a cross edge, assuming an acyclic DAG.
if (Impl.isVisited(PredDep.getSUnit())) {
Impl.visitCrossEdge(PredDep, DFS.getCurr());
continue;
}
Impl.visitPreorder(PredDep.getSUnit());
DFS.follow(PredDep.getSUnit());
}
// Visit the top of the stack in postorder and backtrack.
const SUnit *Child = DFS.getCurr();
const SDep *PredDep = DFS.backtrack();
Impl.visitPostorderNode(Child);
if (PredDep)
Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
if (DFS.isComplete())
break;
}
}
Impl.finalize();
}
/// The root of the given SubtreeID was just scheduled. For all subtrees
/// connected to this tree, record the depth of the connection so that the
/// nearest connected subtrees can be prioritized.
void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
for (const Connection &C : SubtreeConnections[SubtreeID]) {
SubtreeConnectLevels[C.TreeID] =
std::max(SubtreeConnectLevels[C.TreeID], C.Level);
LLVM_DEBUG(dbgs() << " Tree: " << C.TreeID << " @"
<< SubtreeConnectLevels[C.TreeID] << '\n');
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
OS << InstrCount << " / " << Length << " = ";
if (!Length)
OS << "BADILP";
else
OS << format("%g", ((double)InstrCount / Length));
}
LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
namespace llvm {
LLVM_DUMP_METHOD
raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
Val.print(OS);
return OS;
}
} // end namespace llvm
#endif