llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h

509 lines
16 KiB
C++

//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <set>
#include <vector>
namespace llvm {
enum SIScheduleCandReason {
NoCand,
RegUsage,
Latency,
Successor,
Depth,
NodeOrder
};
struct SISchedulerCandidate {
// The reason for this candidate.
SIScheduleCandReason Reason = NoCand;
// Set of reasons that apply to multiple candidates.
uint32_t RepeatReasonSet = 0;
SISchedulerCandidate() = default;
bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
};
class SIScheduleDAGMI;
class SIScheduleBlockCreator;
enum SIScheduleBlockLinkKind {
NoData,
Data
};
class SIScheduleBlock {
SIScheduleDAGMI *DAG;
SIScheduleBlockCreator *BC;
std::vector<SUnit*> SUnits;
std::map<unsigned, unsigned> NodeNum2Index;
std::vector<SUnit*> TopReadySUs;
std::vector<SUnit*> ScheduledSUnits;
/// The top of the unscheduled zone.
IntervalPressure TopPressure;
RegPressureTracker TopRPTracker;
// Pressure: number of said class of registers needed to
// store the live virtual and real registers.
// We do care only of SGPR32 and VGPR32 and do track only virtual registers.
// Pressure of additional registers required inside the block.
std::vector<unsigned> InternalAdditionnalPressure;
// Pressure of input and output registers
std::vector<unsigned> LiveInPressure;
std::vector<unsigned> LiveOutPressure;
// Registers required by the block, and outputs.
// We do track only virtual registers.
// Note that some registers are not 32 bits,
// and thus the pressure is not equal
// to the number of live registers.
std::set<unsigned> LiveInRegs;
std::set<unsigned> LiveOutRegs;
bool Scheduled = false;
bool HighLatencyBlock = false;
std::vector<unsigned> HasLowLatencyNonWaitedParent;
// Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
unsigned ID;
std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
// All blocks successors, and the kind of link
std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
unsigned NumHighLatencySuccessors = 0;
public:
SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
unsigned ID):
DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
~SIScheduleBlock() = default;
unsigned getID() const { return ID; }
/// Functions for Block construction.
void addUnit(SUnit *SU);
// When all SUs have been added.
void finalizeUnits();
// Add block pred, which has instruction predecessor of SU.
void addPred(SIScheduleBlock *Pred);
void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
getSuccs() const { return Succs; }
unsigned Height; // Maximum topdown path length to block without outputs
unsigned Depth; // Maximum bottomup path length to block without inputs
unsigned getNumHighLatencySuccessors() const {
return NumHighLatencySuccessors;
}
bool isHighLatencyBlock() { return HighLatencyBlock; }
// This is approximative.
// Ideally should take into accounts some instructions (rcp, etc)
// are 4 times slower.
int getCost() { return SUnits.size(); }
// The block Predecessors and Successors must be all registered
// before fastSchedule().
// Fast schedule with no particular requirement.
void fastSchedule();
std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
// Complete schedule that will try to minimize reg pressure and
// low latencies, and will fill liveins and liveouts.
// Needs all MIs to be grouped between BeginBlock and EndBlock.
// The MIs can be moved after the scheduling,
// it is just used to allow correct track of live registers.
void schedule(MachineBasicBlock::iterator BeginBlock,
MachineBasicBlock::iterator EndBlock);
bool isScheduled() { return Scheduled; }
// Needs the block to be scheduled inside
// TODO: find a way to compute it.
std::vector<unsigned> &getInternalAdditionnalRegUsage() {
return InternalAdditionnalPressure;
}
std::set<unsigned> &getInRegs() { return LiveInRegs; }
std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
void printDebug(bool Full);
private:
struct SISchedCandidate : SISchedulerCandidate {
// The best SUnit candidate.
SUnit *SU = nullptr;
unsigned SGPRUsage;
unsigned VGPRUsage;
bool IsLowLatency;
unsigned LowLatencyOffset;
bool HasLowLatencyNonWaitedParent;
SISchedCandidate() = default;
bool isValid() const { return SU; }
// Copy the status of another candidate without changing policy.
void setBest(SISchedCandidate &Best) {
assert(Best.Reason != NoCand && "uninitialized Sched candidate");
SU = Best.SU;
Reason = Best.Reason;
SGPRUsage = Best.SGPRUsage;
VGPRUsage = Best.VGPRUsage;
IsLowLatency = Best.IsLowLatency;
LowLatencyOffset = Best.LowLatencyOffset;
HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
}
};
void undoSchedule();
void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
void releaseSucc(SUnit *SU, SDep *SuccEdge);
// InOrOutBlock: restrict to links pointing inside the block (true),
// or restrict to links pointing outside the block (false).
void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
void nodeScheduled(SUnit *SU);
void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
SUnit* pickNode();
void traceCandidate(const SISchedCandidate &Cand);
void initRegPressure(MachineBasicBlock::iterator BeginBlock,
MachineBasicBlock::iterator EndBlock);
};
struct SIScheduleBlocks {
std::vector<SIScheduleBlock*> Blocks;
std::vector<int> TopDownIndex2Block;
std::vector<int> TopDownBlock2Index;
};
enum SISchedulerBlockCreatorVariant {
LatenciesAlone,
LatenciesGrouped,
LatenciesAlonePlusConsecutive
};
class SIScheduleBlockCreator {
SIScheduleDAGMI *DAG;
// unique_ptr handles freeing memory for us.
std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
std::map<SISchedulerBlockCreatorVariant,
SIScheduleBlocks> Blocks;
std::vector<SIScheduleBlock*> CurrentBlocks;
std::vector<int> Node2CurrentBlock;
// Topological sort
// Maps topological index to the node number.
std::vector<int> TopDownIndex2Block;
std::vector<int> TopDownBlock2Index;
std::vector<int> BottomUpIndex2Block;
// 0 -> Color not given.
// 1 to SUnits.size() -> Reserved group (you should only add elements to them).
// Above -> Other groups.
int NextReservedID;
int NextNonReservedID;
std::vector<int> CurrentColoring;
std::vector<int> CurrentTopDownReservedDependencyColoring;
std::vector<int> CurrentBottomUpReservedDependencyColoring;
public:
SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
SIScheduleBlocks
getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
bool isSUInBlock(SUnit *SU, unsigned ID);
private:
// Give a Reserved color to every high latency.
void colorHighLatenciesAlone();
// Create groups of high latencies with a Reserved color.
void colorHighLatenciesGroups();
// Compute coloring for topdown and bottom traversals with
// different colors depending on dependencies on Reserved colors.
void colorComputeReservedDependencies();
// Give color to all non-colored SUs according to Reserved groups dependencies.
void colorAccordingToReservedDependencies();
// Divides Blocks having no bottom up or top down dependencies on Reserved groups.
// The new colors are computed according to the dependencies on the other blocks
// formed with colorAccordingToReservedDependencies.
void colorEndsAccordingToDependencies();
// Cut groups into groups with SUs in consecutive order (except for Reserved groups).
void colorForceConsecutiveOrderInGroup();
// Merge Constant loads that have all their users into another group to the group.
// (TODO: else if all their users depend on the same group, put them there)
void colorMergeConstantLoadsNextGroup();
// Merge SUs that have all their users into another group to the group
void colorMergeIfPossibleNextGroup();
// Merge SUs that have all their users into another group to the group,
// but only for Reserved groups.
void colorMergeIfPossibleNextGroupOnlyForReserved();
// Merge SUs that have all their users into another group to the group,
// but only if the group is no more than a few SUs.
void colorMergeIfPossibleSmallGroupsToNextGroup();
// Divides Blocks with important size.
// Idea of implementation: attribute new colors depending on topdown and
// bottom up links to other blocks.
void cutHugeBlocks();
// Put in one group all instructions with no users in this scheduling region
// (we'd want these groups be at the end).
void regroupNoUserInstructions();
// Give Reserved color to export instructions
void colorExports();
void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
void topologicalSort();
void scheduleInsideBlocks();
void fillStats();
};
enum SISchedulerBlockSchedulerVariant {
BlockLatencyRegUsage,
BlockRegUsageLatency,
BlockRegUsage
};
class SIScheduleBlockScheduler {
SIScheduleDAGMI *DAG;
SISchedulerBlockSchedulerVariant Variant;
std::vector<SIScheduleBlock*> Blocks;
std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
std::set<unsigned> LiveRegs;
// Num of schedulable unscheduled blocks reading the register.
std::map<unsigned, unsigned> LiveRegsConsumers;
std::vector<unsigned> LastPosHighLatencyParentScheduled;
int LastPosWaitedHighLatency;
std::vector<SIScheduleBlock*> BlocksScheduled;
unsigned NumBlockScheduled;
std::vector<SIScheduleBlock*> ReadyBlocks;
unsigned VregCurrentUsage;
unsigned SregCurrentUsage;
// Currently is only approximation.
unsigned maxVregUsage;
unsigned maxSregUsage;
std::vector<unsigned> BlockNumPredsLeft;
std::vector<unsigned> BlockNumSuccsLeft;
public:
SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
SISchedulerBlockSchedulerVariant Variant,
SIScheduleBlocks BlocksStruct);
~SIScheduleBlockScheduler() = default;
std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
unsigned getVGPRUsage() { return maxVregUsage; }
unsigned getSGPRUsage() { return maxSregUsage; }
private:
struct SIBlockSchedCandidate : SISchedulerCandidate {
// The best Block candidate.
SIScheduleBlock *Block = nullptr;
bool IsHighLatency;
int VGPRUsageDiff;
unsigned NumSuccessors;
unsigned NumHighLatencySuccessors;
unsigned LastPosHighLatParentScheduled;
unsigned Height;
SIBlockSchedCandidate() = default;
bool isValid() const { return Block; }
// Copy the status of another candidate without changing policy.
void setBest(SIBlockSchedCandidate &Best) {
assert(Best.Reason != NoCand && "uninitialized Sched candidate");
Block = Best.Block;
Reason = Best.Reason;
IsHighLatency = Best.IsHighLatency;
VGPRUsageDiff = Best.VGPRUsageDiff;
NumSuccessors = Best.NumSuccessors;
NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
Height = Best.Height;
}
};
bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
SIBlockSchedCandidate &TryCand);
bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
SIBlockSchedCandidate &TryCand);
SIScheduleBlock *pickBlock();
void addLiveRegs(std::set<unsigned> &Regs);
void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
void releaseBlockSuccs(SIScheduleBlock *Parent);
void blockScheduled(SIScheduleBlock *Block);
// Check register pressure change
// by scheduling a block with these LiveIn and LiveOut.
std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
std::set<unsigned> &OutRegs);
void schedule();
};
struct SIScheduleBlockResult {
std::vector<unsigned> SUs;
unsigned MaxSGPRUsage;
unsigned MaxVGPRUsage;
};
class SIScheduler {
SIScheduleDAGMI *DAG;
SIScheduleBlockCreator BlockCreator;
public:
SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
~SIScheduler() = default;
struct SIScheduleBlockResult
scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
SISchedulerBlockSchedulerVariant ScheduleVariant);
};
class SIScheduleDAGMI final : public ScheduleDAGMILive {
const SIInstrInfo *SITII;
const SIRegisterInfo *SITRI;
std::vector<SUnit> SUnitsLinksBackup;
// For moveLowLatencies. After all Scheduling variants are tested.
std::vector<unsigned> ScheduledSUnits;
std::vector<unsigned> ScheduledSUnitsInv;
unsigned VGPRSetID;
unsigned SGPRSetID;
public:
SIScheduleDAGMI(MachineSchedContext *C);
~SIScheduleDAGMI() override;
// Entry point for the schedule.
void schedule() override;
// To init Block's RPTracker.
void initRPTracker(RegPressureTracker &RPTracker) {
RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
}
MachineBasicBlock *getBB() { return BB; }
MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
LiveIntervals *getLIS() { return LIS; }
MachineRegisterInfo *getMRI() { return &MRI; }
const TargetRegisterInfo *getTRI() { return TRI; }
ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
SUnit& getEntrySU() { return EntrySU; }
SUnit& getExitSU() { return ExitSU; }
void restoreSULinksLeft();
template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
_Iterator End,
unsigned &VgprUsage,
unsigned &SgprUsage);
std::set<unsigned> getInRegs() {
std::set<unsigned> InRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
InRegs.insert(RegMaskPair.RegUnit);
}
return InRegs;
}
std::set<unsigned> getOutRegs() {
std::set<unsigned> OutRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
OutRegs.insert(RegMaskPair.RegUnit);
}
return OutRegs;
};
unsigned getVGPRSetID() const { return VGPRSetID; }
unsigned getSGPRSetID() const { return SGPRSetID; }
private:
void topologicalSort();
// After scheduling is done, improve low latency placements.
void moveLowLatencies();
public:
// Some stats for scheduling inside blocks.
std::vector<unsigned> IsLowLatencySU;
std::vector<unsigned> LowLatencyOffset;
std::vector<unsigned> IsHighLatencySU;
// Topological sort
// Maps topological index to the node number.
std::vector<int> TopDownIndex2SU;
std::vector<int> BottomUpIndex2SU;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H