forked from OSchip/llvm-project
464 lines
15 KiB
C++
464 lines
15 KiB
C++
//=-- SystemZHazardRecognizer.cpp - SystemZ Hazard Recognizer ---*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file defines a hazard recognizer for the SystemZ scheduler.
|
|
//
|
|
// This class is used by the SystemZ scheduling strategy to maintain
|
|
// the state during scheduling, and provide cost functions for
|
|
// scheduling candidates. This includes:
|
|
//
|
|
// * Decoder grouping. A decoder group can maximally hold 3 uops, and
|
|
// instructions that always begin a new group should be scheduled when
|
|
// the current decoder group is empty.
|
|
// * Processor resources usage. It is beneficial to balance the use of
|
|
// resources.
|
|
//
|
|
// A goal is to consider all instructions, also those outside of any
|
|
// scheduling region. Such instructions are "advanced" past and include
|
|
// single instructions before a scheduling region, branches etc.
|
|
//
|
|
// A block that has only one predecessor continues scheduling with the state
// of that predecessor (which may be updated by emitting branches).
|
|
//
|
|
// ===---------------------------------------------------------------------===//
|
|
|
|
#include "SystemZHazardRecognizer.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "machine-scheduler"
|
|
|
|
// This is the limit of processor resource usage at which the
|
|
// scheduler should try to look for other instructions (not using the
|
|
// critical resource).
|
|
static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
|
|
cl::desc("The OOO window for processor "
|
|
"resources during scheduling."),
|
|
cl::init(8));
|
|
|
|
unsigned SystemZHazardRecognizer::
|
|
getNumDecoderSlots(SUnit *SU) const {
|
|
const MCSchedClassDesc *SC = getSchedClass(SU);
|
|
if (!SC->isValid())
|
|
return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
|
|
|
|
assert((SC->NumMicroOps != 2 || (SC->BeginGroup && !SC->EndGroup)) &&
|
|
"Only cracked instruction can have 2 uops.");
|
|
assert((SC->NumMicroOps < 3 || (SC->BeginGroup && SC->EndGroup)) &&
|
|
"Expanded instructions always group alone.");
|
|
assert((SC->NumMicroOps < 3 || (SC->NumMicroOps % 3 == 0)) &&
|
|
"Expanded instructions fill the group(s).");
|
|
|
|
return SC->NumMicroOps;
|
|
}
|
|
|
|
// Return the current cycle index in the range 0-5: the slot position within
// the current decoder group, offset by 3 when an odd number of groups has
// been completed. If SU is given and does not fit into the current group,
// return the index of the first slot of the following group instead.
unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
  const bool OddGroupCount = (GrpCount % 2) != 0;
  const unsigned CycleIdx = CurrGroupSize + (OddGroupCount ? 3u : 0u);

  // No candidate, or the candidate goes into the current group as-is.
  if (SU == nullptr || fitsIntoCurrentGroup(SU))
    return CycleIdx;

  // SU would start a new group: skip the rest of a partially filled one.
  switch (CycleIdx) {
  case 1:
  case 2:
    return 3;
  case 4:
  case 5:
    return 0;
  default:
    return CycleIdx;
  }
}
|
|
|
|
// Report a hazard exactly when SU does not fit into the current decoder
// group. The Stalls argument is not used.
ScheduleHazardRecognizer::HazardType
SystemZHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  if (fitsIntoCurrentGroup(SU))
    return NoHazard;
  return Hazard;
}
|
|
|
|
void SystemZHazardRecognizer::Reset() {
|
|
CurrGroupSize = 0;
|
|
CurrGroupHas4RegOps = false;
|
|
clearProcResCounters();
|
|
GrpCount = 0;
|
|
LastFPdOpCycleIdx = UINT_MAX;
|
|
LastEmittedMI = nullptr;
|
|
LLVM_DEBUG(CurGroupDbg = "";);
|
|
}
|
|
|
|
bool
|
|
SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
|
|
const MCSchedClassDesc *SC = getSchedClass(SU);
|
|
if (!SC->isValid())
|
|
return true;
|
|
|
|
// A cracked instruction only fits into schedule if the current
|
|
// group is empty.
|
|
if (SC->BeginGroup)
|
|
return (CurrGroupSize == 0);
|
|
|
|
// An instruction with 4 register operands will not fit in last slot.
|
|
assert ((CurrGroupSize < 2 || !CurrGroupHas4RegOps) &&
|
|
"Current decoder group is already full!");
|
|
if (CurrGroupSize == 2 && has4RegOps(SU->getInstr()))
|
|
return false;
|
|
|
|
// Since a full group is handled immediately in EmitInstruction(),
|
|
// SU should fit into current group. NumSlots should be 1 or 0,
|
|
// since it is not a cracked or expanded instruction.
|
|
assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
|
|
"Expected normal instruction to fit in non-full group!");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const {
|
|
const MachineFunction &MF = *MI->getParent()->getParent();
|
|
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
const MCInstrDesc &MID = MI->getDesc();
|
|
unsigned Count = 0;
|
|
for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) {
|
|
const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF);
|
|
if (RC == nullptr)
|
|
continue;
|
|
if (OpIdx >= MID.getNumDefs() &&
|
|
MID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
|
|
continue;
|
|
Count++;
|
|
}
|
|
return Count >= 4;
|
|
}
|
|
|
|
void SystemZHazardRecognizer::nextGroup() {
|
|
if (CurrGroupSize == 0)
|
|
return;
|
|
|
|
LLVM_DEBUG(dumpCurrGroup("Completed decode group"));
|
|
LLVM_DEBUG(CurGroupDbg = "";);
|
|
|
|
int NumGroups = ((CurrGroupSize > 3) ? (CurrGroupSize / 3) : 1);
|
|
assert((CurrGroupSize <= 3 || CurrGroupSize % 3 == 0) &&
|
|
"Current decoder group bad.");
|
|
|
|
// Reset counter for next group.
|
|
CurrGroupSize = 0;
|
|
CurrGroupHas4RegOps = false;
|
|
|
|
GrpCount += ((unsigned) NumGroups);
|
|
|
|
// Decrease counters for execution units.
|
|
for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
|
|
ProcResourceCounters[i] = ((ProcResourceCounters[i] > NumGroups)
|
|
? (ProcResourceCounters[i] - NumGroups)
|
|
: 0);
|
|
|
|
// Clear CriticalResourceIdx if it is now below the threshold.
|
|
if (CriticalResourceIdx != UINT_MAX &&
|
|
(ProcResourceCounters[CriticalResourceIdx] <=
|
|
ProcResCostLim))
|
|
CriticalResourceIdx = UINT_MAX;
|
|
|
|
LLVM_DEBUG(dumpState(););
|
|
}
|
|
|
|
#ifndef NDEBUG // Debug output
|
|
// Print a one-line description of SU to OS: node number, opcode name, the
// processor resources it writes (with cycle counts above 1), and flags for
// micro-op count, grouping, unbuffered resources and 4 register operands.
void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
  OS << "SU(" << SU->NodeNum << "):"
     << TII->getName(SU->getInstr()->getOpcode());

  const MCSchedClassDesc *SchedClass = getSchedClass(SU);
  if (!SchedClass->isValid())
    return;

  for (const MCWriteProcResEntry &PRE :
       make_range(SchedModel->getWriteProcResBegin(SchedClass),
                  SchedModel->getWriteProcResEnd(SchedClass))) {
    std::string UnitName(
        SchedModel->getProcResource(PRE.ProcResourceIdx)->Name);

    // Trim e.g. Z13_FXaUnit -> FXa: drop everything up to and including the
    // first '_' (nothing if there is none), then cut at "Unit".
    UnitName.erase(0, UnitName.find('_') + 1);
    const size_t UnitPos = UnitName.find("Unit");
    if (UnitPos != std::string::npos)
      UnitName.erase(UnitPos);
    if (UnitName == "LS") // LSUnit -> LSU
      UnitName = "LSU";
    OS << "/" << UnitName;

    if (PRE.Cycles > 1)
      OS << "(" << PRE.Cycles << "cyc)";
  }

  if (SchedClass->NumMicroOps > 1)
    OS << "/" << SchedClass->NumMicroOps << "uops";

  // At most one grouping tag is printed.
  if (SchedClass->BeginGroup)
    OS << (SchedClass->EndGroup ? "/GroupsAlone" : "/BeginsGroup");
  else if (SchedClass->EndGroup)
    OS << "/EndsGroup";

  if (SU->isUnbuffered)
    OS << "/Unbuffered";
  if (has4RegOps(SU->getInstr()))
    OS << "/4RegOps";
}
|
|
|
|
void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
|
|
dbgs() << "++ " << Msg;
|
|
dbgs() << ": ";
|
|
|
|
if (CurGroupDbg.empty())
|
|
dbgs() << " <empty>\n";
|
|
else {
|
|
dbgs() << "{ " << CurGroupDbg << " }";
|
|
dbgs() << " (" << CurrGroupSize << " decoder slot"
|
|
<< (CurrGroupSize > 1 ? "s":"")
|
|
<< (CurrGroupHas4RegOps ? ", 4RegOps" : "")
|
|
<< ")\n";
|
|
}
|
|
}
|
|
|
|
void SystemZHazardRecognizer::dumpProcResourceCounters() const {
|
|
bool any = false;
|
|
|
|
for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
|
|
if (ProcResourceCounters[i] > 0) {
|
|
any = true;
|
|
break;
|
|
}
|
|
|
|
if (!any)
|
|
return;
|
|
|
|
dbgs() << "++ | Resource counters: ";
|
|
for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
|
|
if (ProcResourceCounters[i] > 0)
|
|
dbgs() << SchedModel->getProcResource(i)->Name
|
|
<< ":" << ProcResourceCounters[i] << " ";
|
|
dbgs() << "\n";
|
|
|
|
if (CriticalResourceIdx != UINT_MAX)
|
|
dbgs() << "++ | Critical resource: "
|
|
<< SchedModel->getProcResource(CriticalResourceIdx)->Name
|
|
<< "\n";
|
|
}
|
|
|
|
void SystemZHazardRecognizer::dumpState() const {
|
|
dumpCurrGroup("| Current decoder group");
|
|
dbgs() << "++ | Current cycle index: "
|
|
<< getCurrCycleIdx() << "\n";
|
|
dumpProcResourceCounters();
|
|
if (LastFPdOpCycleIdx != UINT_MAX)
|
|
dbgs() << "++ | Last FPd cycle index: " << LastFPdOpCycleIdx << "\n";
|
|
}
|
|
|
|
#endif //NDEBUG
|
|
|
|
// Forget the critical resource and set one zeroed usage counter per
// processor resource kind of the scheduling model.
void SystemZHazardRecognizer::clearProcResCounters() {
  CriticalResourceIdx = UINT_MAX;

  const unsigned NumKinds = SchedModel->getNumProcResourceKinds();
  ProcResourceCounters.assign(NumKinds, 0);
}
|
|
|
|
// Return true if MI is a branch, a return, or a SystemZ::CondTrap.
static inline bool isBranchRetTrap(MachineInstr *MI) {
  if (MI->isBranch())
    return true;
  if (MI->isReturn())
    return true;
  return MI->getOpcode() == SystemZ::CondTrap;
}
|
|
|
|
// Update state with SU as the next scheduled unit.
|
|
void SystemZHazardRecognizer::
|
|
EmitInstruction(SUnit *SU) {
|
|
const MCSchedClassDesc *SC = getSchedClass(SU);
|
|
LLVM_DEBUG(dbgs() << "++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
|
|
dbgs() << "\n";);
|
|
LLVM_DEBUG(dumpCurrGroup("Decode group before emission"););
|
|
|
|
// If scheduling an SU that must begin a new decoder group, move on
|
|
// to next group.
|
|
if (!fitsIntoCurrentGroup(SU))
|
|
nextGroup();
|
|
|
|
LLVM_DEBUG(raw_string_ostream cgd(CurGroupDbg);
|
|
if (CurGroupDbg.length()) cgd << ", "; dumpSU(SU, cgd););
|
|
|
|
LastEmittedMI = SU->getInstr();
|
|
|
|
// After returning from a call, we don't know much about the state.
|
|
if (SU->isCall) {
|
|
LLVM_DEBUG(dbgs() << "++ Clearing state after call.\n";);
|
|
Reset();
|
|
LastEmittedMI = SU->getInstr();
|
|
return;
|
|
}
|
|
|
|
// Increase counter for execution unit(s).
|
|
for (TargetSchedModel::ProcResIter
|
|
PI = SchedModel->getWriteProcResBegin(SC),
|
|
PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
|
|
// Don't handle FPd together with the other resources.
|
|
if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
|
|
continue;
|
|
int &CurrCounter =
|
|
ProcResourceCounters[PI->ProcResourceIdx];
|
|
CurrCounter += PI->Cycles;
|
|
// Check if this is now the new critical resource.
|
|
if ((CurrCounter > ProcResCostLim) &&
|
|
(CriticalResourceIdx == UINT_MAX ||
|
|
(PI->ProcResourceIdx != CriticalResourceIdx &&
|
|
CurrCounter >
|
|
ProcResourceCounters[CriticalResourceIdx]))) {
|
|
LLVM_DEBUG(
|
|
dbgs() << "++ New critical resource: "
|
|
<< SchedModel->getProcResource(PI->ProcResourceIdx)->Name
|
|
<< "\n";);
|
|
CriticalResourceIdx = PI->ProcResourceIdx;
|
|
}
|
|
}
|
|
|
|
// Make note of an instruction that uses a blocking resource (FPd).
|
|
if (SU->isUnbuffered) {
|
|
LastFPdOpCycleIdx = getCurrCycleIdx(SU);
|
|
LLVM_DEBUG(dbgs() << "++ Last FPd cycle index: " << LastFPdOpCycleIdx
|
|
<< "\n";);
|
|
}
|
|
|
|
// Insert SU into current group by increasing number of slots used
|
|
// in current group.
|
|
CurrGroupSize += getNumDecoderSlots(SU);
|
|
CurrGroupHas4RegOps |= has4RegOps(SU->getInstr());
|
|
unsigned GroupLim = (CurrGroupHas4RegOps ? 2 : 3);
|
|
assert((CurrGroupSize <= GroupLim || CurrGroupSize == getNumDecoderSlots(SU))
|
|
&& "SU does not fit into decoder group!");
|
|
|
|
// Check if current group is now full/ended. If so, move on to next
|
|
// group to be ready to evaluate more candidates.
|
|
if (CurrGroupSize >= GroupLim || SC->EndGroup)
|
|
nextGroup();
|
|
}
|
|
|
|
// Return a decoder grouping cost for scheduling SU next: negative when SU
// fits the current group well, positive when it would waste decoder slots,
// and 0 when it makes no difference.
int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
  const MCSchedClassDesc *SchedClass = getSchedClass(SU);
  if (!SchedClass->isValid())
    return 0;

  // If SU begins new group, it can either break a current group early
  // or fit naturally if current group is empty (negative cost).
  if (SchedClass->BeginGroup)
    return (CurrGroupSize == 0) ? -1 : static_cast<int>(3 - CurrGroupSize);

  // Similarly, a group-ending SU may either fit well (last in group), or
  // end the group prematurely.
  if (SchedClass->EndGroup) {
    const unsigned SizeAfterSU = CurrGroupSize + getNumDecoderSlots(SU);
    return (SizeAfterSU >= 3) ? -1 : static_cast<int>(3 - SizeAfterSU);
  }

  // An instruction with 4 register operands will not fit in last slot.
  // Most other instructions can be placed in any decoder slot.
  const bool WastesLastSlot =
      CurrGroupSize == 2 && has4RegOps(SU->getInstr());
  return WastesLastSlot ? 1 : 0;
}
|
|
|
|
// Return true if the FPd op SU is well placed relative to the previous FPd
// op: either there is no previous one, or the cycle indices are exactly 3
// apart.
bool SystemZHazardRecognizer::isFPdOpPreferred_distance(SUnit *SU) const {
  assert (SU->isUnbuffered);

  // If this is the first FPd op, it should be scheduled high.
  const unsigned PrevFPdIdx = LastFPdOpCycleIdx;
  if (PrevFPdIdx == UINT_MAX)
    return true;

  // If this is not the first FPd op, it should go into the other side
  // of the processor to use the other FPd unit there. This should
  // generally happen if two FPd ops are placed with 2 other
  // instructions between them (modulo 6).
  const unsigned ThisIdx = getCurrCycleIdx(SU);
  const unsigned Distance =
      (PrevFPdIdx > ThisIdx) ? (PrevFPdIdx - ThisIdx) : (ThisIdx - PrevFPdIdx);
  return Distance == 3;
}
|
|
|
|
int SystemZHazardRecognizer::
|
|
resourcesCost(SUnit *SU) {
|
|
int Cost = 0;
|
|
|
|
const MCSchedClassDesc *SC = getSchedClass(SU);
|
|
if (!SC->isValid())
|
|
return 0;
|
|
|
|
// For a FPd op, either return min or max value as indicated by the
|
|
// distance to any prior FPd op.
|
|
if (SU->isUnbuffered)
|
|
Cost = (isFPdOpPreferred_distance(SU) ? INT_MIN : INT_MAX);
|
|
// For other instructions, give a cost to the use of the critical resource.
|
|
else if (CriticalResourceIdx != UINT_MAX) {
|
|
for (TargetSchedModel::ProcResIter
|
|
PI = SchedModel->getWriteProcResBegin(SC),
|
|
PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
|
|
if (PI->ProcResourceIdx == CriticalResourceIdx)
|
|
Cost = PI->Cycles;
|
|
}
|
|
|
|
return Cost;
|
|
}
|
|
|
|
// Update state for MI by wrapping it in a temporary SUnit and passing that
// to EmitInstruction(). TakenBranch tells whether MI is a taken branch, in
// which case any non-empty decoder group is ended afterwards.
void SystemZHazardRecognizer::emitInstruction(MachineInstr *MI,
                                              bool TakenBranch) {
  // Make a temporary SUnit.
  SUnit SU(MI, 0);

  // Set interesting flags.
  SU.isCall = MI->isCall();

  const MCSchedClassDesc *SchedClass = SchedModel->resolveSchedClass(MI);
  for (const MCWriteProcResEntry &PRE :
       make_range(SchedModel->getWriteProcResBegin(SchedClass),
                  SchedModel->getWriteProcResEnd(SchedClass))) {
    // BufferSize 0 marks a reserved resource, 1 an unbuffered one; other
    // values set no flag.
    const int BufferSize =
        SchedModel->getProcResource(PRE.ProcResourceIdx)->BufferSize;
    if (BufferSize == 0)
      SU.hasReservedResource = true;
    else if (BufferSize == 1)
      SU.isUnbuffered = true;
  }

  const unsigned GroupSizeBeforeEmit = CurrGroupSize;
  EmitInstruction(&SU);

  if (TakenBranch) {
    // A taken branch ends whatever group is in progress.
    if (CurrGroupSize > 0)
      nextGroup();
  } else if (GroupSizeBeforeEmit == 1 && isBranchRetTrap(MI)) {
    // NT Branch on second slot ends group.
    nextGroup();
  }

  assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
          "Scheduler: unhandled terminator!");
}
|
|
|
|
void SystemZHazardRecognizer::
|
|
copyState(SystemZHazardRecognizer *Incoming) {
|
|
// Current decoder group
|
|
CurrGroupSize = Incoming->CurrGroupSize;
|
|
LLVM_DEBUG(CurGroupDbg = Incoming->CurGroupDbg;);
|
|
|
|
// Processor resources
|
|
ProcResourceCounters = Incoming->ProcResourceCounters;
|
|
CriticalResourceIdx = Incoming->CriticalResourceIdx;
|
|
|
|
// FPd
|
|
LastFPdOpCycleIdx = Incoming->LastFPdOpCycleIdx;
|
|
GrpCount = Incoming->GrpCount;
|
|
}
|