[AMDGPU] Iterative scheduling infrastructure + minimal registry scheduler

Differential revision: https://reviews.llvm.org/D31046

llvm-svn: 298368
This commit is contained in:
Valery Pykhtin 2017-03-21 13:15:46 +00:00
parent 044e003203
commit fd4c410f4d
12 changed files with 1764 additions and 3 deletions

View File

@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "SIFrameLowering.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@ -317,6 +318,11 @@ public:
/// the given LDS memory size is the only constraint.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
// Convenience overload: queries occupancy using the LDS size recorded in
// this machine function's SIMachineFunctionInfo, forwarding to the
// byte-count overload above.
unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  return getOccupancyWithLocalMemSize(
      MF.getInfo<SIMachineFunctionInfo>()->getLDSSize(), *MF.getFunction());
}
// FP16 and FP64 denormal support are controlled by the same subtarget flag
// (note the shared FP64FP16Denormals member).
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}

View File

@@ -24,6 +24,7 @@
#endif
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
@ -155,6 +156,20 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
return DAG;
}
// Factory for the "gcn-max-occupancy-experimental" registry entry: the
// iterative scheduler driven by the legacy max-occupancy strategy.
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto *Sched = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  // Cluster neighboring loads and stores, as the default GCN scheduler does.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));
  Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  return Sched;
}
// Factory for the "gcn-minreg" registry entry below: iterative scheduler
// driven by the forced minimal-register-usage strategy.
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
return new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@ -168,6 +183,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
// Experimental iterative variant of the max-occupancy scheduler.
static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",
createIterativeGCNMaxOccupancyMachineScheduler);
// Iterative scheduler forcing minimal register usage.
static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.

View File

@ -94,6 +94,9 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
GCNRegPressure.cpp
${GLOBAL_ISEL_BUILD_FILES}
)

View File

@@ -0,0 +1,528 @@
//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
namespace llvm {
// Defined in GCNMinRegStrategy.cpp: builds a schedule for the given DAG,
// starting from TopRoots, that tries to minimize register usage.
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
}
// shim accessors for different order containers
// These overloads let the templated scheduleRegion/getSchedulePressure code
// consume schedules expressed as MachineInstr* lists, SUnit* lists or SUnit
// ranges through a single spelling.
static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
return MI;
}
static inline MachineInstr *getMachineInstr(const SUnit *SU) {
return SU->getInstr();
}
static inline MachineInstr *getMachineInstr(const SUnit &SU) {
return SU.getInstr();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump the instructions of region [Begin, End), prefixed with their slot
// indexes when LIS is available. At most MaxInstNum instructions are shown;
// longer regions are elided with "..." followed by the region's last
// instruction. The boundary instruction at End (if inside the block) is
// printed after a "----" separator.
static void printRegion(raw_ostream &OS,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
const LiveIntervals *LIS,
unsigned MaxInstNum =
std::numeric_limits<unsigned>::max()) {
auto BB = Begin->getParent();
OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
<< ' ' << BB->getName() << ":\n";
auto I = Begin;
// Always show at least one instruction.
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
if (!I->isDebugValue() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (I != End) {
// Output was truncated: elide the middle, then show the last instruction.
OS << "\t...\n";
I = std::prev(End);
if (!I->isDebugValue() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (End != BB->end()) { // print boundary inst if present
OS << "----\n";
if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
OS << *End;
}
}
LLVM_DUMP_METHOD
// Dump the live-in and live-out register pressure of region [Begin, End),
// computed from the live intervals.
static void printLivenessInfo(raw_ostream &OS,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
const LiveIntervals *LIS) {
const auto BB = Begin->getParent();
const auto &MRI = BB->getParent()->getRegInfo();
const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
OS << "LIn RP: ";
getRegPressure(MRI, LiveIns).print(OS);
// When End is the block end there is no boundary instruction, so the last
// real instruction carries the live-outs.
const auto BottomMI = End == BB->end() ? std::prev(End) : End;
const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
OS << "LOt RP: ";
getRegPressure(MRI, LiveOuts).print(OS);
}
LLVM_DUMP_METHOD
// Dump every region recorded by enterRegion: its first instruction, its
// live-in/live-out pressure and the cached maximum pressure.
void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
const auto &ST = MF.getSubtarget<SISubtarget>();
for (const auto R : Regions) {
OS << "Region to schedule ";
// Only the first instruction of each region is shown (MaxInstNum = 1).
printRegion(OS, R->Begin, R->End, LIS, 1);
printLivenessInfo(OS, R->Begin, R->End, LIS);
OS << "Max RP: ";
R->MaxPressure.print(OS, &ST);
}
}
LLVM_DUMP_METHOD
// Dump a region together with its cached pressure before scheduling and the
// pressure RP measured after the schedule was applied.
void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
const Region *R,
const GCNRegPressure &RP) const {
OS << "\nAfter scheduling ";
printRegion(OS, R->Begin, R->End, LIS);
printSchedRP(OS, R->MaxPressure, RP);
OS << '\n';
}
LLVM_DUMP_METHOD
// Dump a before/after register pressure pair with subtarget occupancy
// annotations.
void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const {
const auto &ST = MF.getSubtarget<SISubtarget>();
OS << "RP before: ";
Before.print(OS, &ST);
OS << "RP after: ";
After.print(OS, &ST);
}
#endif
// DAG builder helper
// RAII helper: builds the scheduling DAG for a region via the base-class
// machinery (startBlock/enterRegion/buildSchedGraph) and keeps the top
// roots. The destructor unwinds the region/block state so the scheduler
// can go on to process further regions.
class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
SmallVector<SUnit*, 8> TopRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
auto BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
// Lane masks are tracked because the pressure trackers need them.
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.Topo.InitDAGTopologicalSorting();
SmallVector<SUnit*, 8> BotRoots;
// Bottom roots are computed but only the top roots are retained.
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
~BuildDAG() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
}
ArrayRef<const SUnit*> getTopRoots() const {
return TopRoots;
}
};
// RAII helper that temporarily installs a caller-owned MachineSchedStrategy
// into the scheduler so the base-class schedule() runs with it, restoring
// the original strategy (and region/block state) on destruction.
class GCNIterativeScheduler::OverrideLegacyStrategy {
GCNIterativeScheduler &Sch;
Region &Rgn;
// Original strategy, reinstated in the destructor.
std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
GCNRegPressure SaveMaxRP;
public:
OverrideLegacyStrategy(Region &R,
MachineSchedStrategy &OverrideStrategy,
GCNIterativeScheduler &_Sch)
: Sch(_Sch)
, Rgn(R)
, SaveSchedImpl(std::move(_Sch.SchedImpl))
, SaveMaxRP(R.MaxPressure) {
// NOTE: SchedImpl now holds a strategy this class does NOT own.
Sch.SchedImpl.reset(&OverrideStrategy);
auto BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
}
~OverrideLegacyStrategy() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
// release() first so the caller-owned override strategy isn't deleted.
Sch.SchedImpl.release();
Sch.SchedImpl = std::move(SaveSchedImpl);
}
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
DEBUG(dbgs() << "\nScheduling ";
printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
// Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
Sch.RegionEnd = Rgn.End;
//assert(Rgn.End == Sch.RegionEnd);
Rgn.Begin = Sch.RegionBegin;
// Cached pressure is stale now; callers recompute it after scheduling.
Rgn.MaxPressure.clear();
}
void restoreOrder() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
// DAG SUnits are stored using original region's order
// so just use SUnits as the restoring schedule
Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
}
};
// just a stub to make base class happy
// GCNIterativeScheduler drives scheduling itself from finalizeSchedule(),
// so the default strategy never picks nodes (pickNode returns nullptr).
class SchedStrategyStub : public MachineSchedStrategy {
public:
bool shouldTrackPressure() const override { return false; }
bool shouldTrackLaneMasks() const override { return false; }
void initialize(ScheduleDAGMI *DAG) override {}
SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
void schedNode(SUnit *SU, bool IsTopNode) override {}
void releaseTopNode(SUnit *SU) override {}
void releaseBottomNode(SUnit *SU) override {}
};
// The base class gets the no-op stub strategy; actual scheduling is driven
// from finalizeSchedule() according to the selected StrategyKind.
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
: BaseClass(C, llvm::make_unique<SchedStrategyStub>())
, Context(C)
, Strategy(S)
, UPTracker(*LIS) {
}
// returns max pressure for a region
GCNRegPressure
GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End)
const {
// For the purpose of pressure tracking bottom inst of the region should
// be also processed. End is either BB end, BB terminator inst or sched
// boundary inst.
auto const BBEnd = Begin->getParent()->end();
auto const BottomMI = End == BBEnd ? std::prev(End) : End;
// scheduleRegions walks bottom to top, so its likely we just get next
// instruction to track
auto AfterBottomMI = std::next(BottomMI);
if (AfterBottomMI == BBEnd ||
&*AfterBottomMI != UPTracker.getLastTrackedMI()) {
// Can't reuse previous tracker state; restart tracking at BottomMI.
UPTracker.reset(*BottomMI);
} else {
assert(UPTracker.isValid());
}
// Walk the region bottom-up, including Begin itself.
for (auto I = BottomMI; I != Begin; --I)
UPTracker.recede(*I);
UPTracker.recede(*Begin);
assert(UPTracker.isValid() ||
(dbgs() << "Tracked region ",
printRegion(dbgs(), Begin, End, LIS), false));
return UPTracker.moveMaxPressure();
}
// returns max pressure for a tentative schedule
template <typename Range> GCNRegPressure
GCNIterativeScheduler::getSchedulePressure(const Region &R,
Range &&Schedule) const {
auto const BBEnd = R.Begin->getParent()->end();
// Use a fresh tracker: the tentative schedule isn't applied to the IR yet.
GCNUpwardRPTracker RPTracker(*LIS);
if (R.End != BBEnd) {
// R.End points to the boundary instruction but the
// schedule doesn't include it
RPTracker.reset(*R.End);
RPTracker.recede(*R.End);
} else {
// R.End doesn't point to the boundary instruction
RPTracker.reset(*std::prev(BBEnd));
}
// Feed the tentative schedule to the tracker in bottom-up order.
for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
RPTracker.recede(*getMachineInstr(*--I));
}
return RPTracker.moveMaxPressure();
}
void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
// Only record regions with more than two instructions; the Region objects
// are bump-allocated and live for the whole pass.
if (NumRegionInstrs > 2) {
Regions.push_back(
new (Alloc.Allocate())
Region { Begin, End, NumRegionInstrs,
getRegionPressure(Begin, End), nullptr });
}
}
void GCNIterativeScheduler::schedule() { // overriden
// do nothing
// Real scheduling is deferred to finalizeSchedule() once all regions have
// been collected; this override only emits debug output.
DEBUG(
printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
dbgs() << "Max RP: ";
Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
}
dbgs() << '\n';
);
}
// Entry point for the actual work: once every region of the function has
// been recorded, dispatch to the strategy chosen at construction time.
void GCNIterativeScheduler::finalizeSchedule() { // overriden
  if (Regions.empty())
    return;
  if (Strategy == SCHEDULE_LEGACYMAXOCCUPANCY) {
    scheduleLegacyMaxOccupancy();
  } else if (Strategy == SCHEDULE_MINREGFORCED) {
    scheduleMinReg(true);
  } else {
    assert(Strategy == SCHEDULE_MINREGONLY);
    scheduleMinReg();
  }
}
// Detach schedule from SUnits and interleave it with debug values.
// Returned schedule becomes independent of DAG state.
std::vector<MachineInstr*>
GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
  std::vector<MachineInstr*> Detached;
  Detached.reserve(Schedule.size() * 2);
  if (FirstDbgValue)
    Detached.push_back(FirstDbgValue);
  auto DbgB = DbgValues.begin();
  auto DbgE = DbgValues.end();
  for (const SUnit *SU : Schedule) {
    MachineInstr *MI = SU->getInstr();
    Detached.push_back(MI);
    // If a DBG_VALUE was recorded against this instruction, keep it adjacent.
    auto D = std::find_if(DbgB, DbgE,
                          [MI](decltype(*DbgB) &P) { return P.second == MI; });
    if (D != DbgE)
      Detached.push_back(D->first);
  }
  return Detached;
}
// Record a detached copy of Schedule (plus its reported max pressure) as the
// region's BestSchedule, replacing any previous one.
void GCNIterativeScheduler::setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP) {
R.BestSchedule.reset(
new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
}
// Apply the region's previously recorded BestSchedule to the IR, then drop
// it: the IR now reflects that schedule.
void GCNIterativeScheduler::scheduleBest(Region &R) {
assert(R.BestSchedule.get() && "No schedule specified");
scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
R.BestSchedule.reset();
}
// minimal required region scheduler, works for ranges of SUnits*,
// SUnits or MachineIntrs*
// Applies Schedule to the IR: moves instructions into the given order,
// updates live intervals and operand flags, and refreshes the region's
// cached state. MaxRP, when non-empty, is recorded as the region's pressure
// and cross-checked against recomputed values in debug builds.
template <typename Range>
void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
const GCNRegPressure &MaxRP) {
assert(RegionBegin == R.Begin && RegionEnd == R.End);
assert(LIS != nullptr);
#ifndef NDEBUG
const auto SchedMaxRP = getSchedulePressure(R, Schedule);
#endif
auto BB = R.Begin->getParent();
auto Top = R.Begin;
for (const auto &I : Schedule) {
auto MI = getMachineInstr(I);
// Move MI into its scheduled position unless it is already there.
if (MI != &*Top) {
BB->remove(MI);
BB->insert(Top, MI);
if (!MI->isDebugValue())
LIS->handleMove(*MI, true);
}
if (!MI->isDebugValue()) {
// Reset read - undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
/*IgnoreDead*/false);
// Adjust liveness and add missing dead+read-undef flags.
auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
}
Top = std::next(MI->getIterator());
}
RegionBegin = getMachineInstr(Schedule.front());
// Schedule consisting of MachineInstr* is considered 'detached'
// and already interleaved with debug values
if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) {
placeDebugValues();
// Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
//assert(R.End == RegionEnd);
RegionEnd = R.End;
}
R.Begin = RegionBegin;
R.MaxPressure = MaxRP;
// Debug builds verify that predicted, reported and actual pressures agree.
#ifndef NDEBUG
const auto RegionMaxRP = getRegionPressure(R);
const auto &ST = MF.getSubtarget<SISubtarget>();
#endif
assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
|| (dbgs() << "Max RP mismatch!!!\n"
"RP for schedule (calculated): ",
SchedMaxRP.print(dbgs(), &ST),
dbgs() << "RP for schedule (reported): ",
MaxRP.print(dbgs(), &ST),
dbgs() << "RP after scheduling: ",
RegionMaxRP.print(dbgs(), &ST),
false));
}
// Sort recorded regions by pressure - highest at the front
// TargetOcc caps the occupancy considered when comparing pressures (see
// GCNRegPressure::less).
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
const auto &ST = MF.getSubtarget<SISubtarget>();
std::sort(Regions.begin(), Regions.end(),
[&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
}
///////////////////////////////////////////////////////////////////////////////
// Legacy MaxOccupancy Strategy

// Tries to increase occupancy applying minreg scheduler for a sequence of
// most demanding regions. Obtained schedules are saved as BestSchedule for a
// region.
// TargetOcc is the best achievable occupancy for a kernel.
// Returns better occupancy on success or current occupancy on fail.
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
  // TODO: assert Regions are sorted descending by pressure
  const auto &ST = MF.getSubtarget<SISubtarget>();
  // Regions are sorted highest pressure first, so the front region bounds
  // the currently achievable occupancy.
  const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
  // (fixed debug message: was "Trying to to improve")
  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
               << ", current = " << Occ << '\n');
  auto NewOcc = TargetOcc;
  for (auto R : Regions) {
    // Regions already at or above the best occupancy found can't limit it.
    if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
      break;
    DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
          printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
    BuildDAG DAG(*R, *this);
    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
    const auto MaxRP = getSchedulePressure(*R, MinSchedule);
    DEBUG(dbgs() << "Occupancy improvement attempt:\n";
          printSchedRP(dbgs(), R->MaxPressure, MaxRP));
    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
    // Stop as soon as the minreg schedule no longer beats current occupancy.
    if (NewOcc <= Occ)
      break;
    setBestSchedule(*R, MinSchedule, MaxRP);
  }
  DEBUG(dbgs() << "New occupancy = " << NewOcc
               << ", prev occupancy = " << Occ << '\n');
  return std::max(NewOcc, Occ);
}
// Schedule all recorded regions with the legacy GCNMaxOccupancySchedStrategy,
// optionally first running the minreg scheduler to raise the occupancy
// target. Regions that still miss the target fall back to their saved
// BestSchedule, or to the original instruction order.
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
const auto &ST = MF.getSubtarget<SISubtarget>();
auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
if (TryMaximizeOccupancy && Occ < TgtOcc)
Occ = tryMaximizeOccupancy(TgtOcc);
// This is really weird but for some magic scheduling regions twice
// gives performance improvement
const int NumPasses = Occ < TgtOcc ? 2 : 1;
TgtOcc = std::min(Occ, TgtOcc);
DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = " << TgtOcc << '\n');
GCNMaxOccupancySchedStrategy LStrgy(Context);
for (int I = 0; I < NumPasses; ++I) {
// running first pass with TargetOccupancy = 0 mimics previous scheduling
// approach and is a performance magic
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
for (auto R : Regions) {
OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
Ovr.schedule();
const auto RP = getRegionPressure(*R);
DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
// The saved minreg schedule meets the target - apply it.
DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
// No better schedule available: restore the original order.
DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Minimal Register Strategy

// Reschedule regions with the minreg scheduler, most demanding first.
// Unless 'force' is set, the walk stops once a region's pressure drops below
// the running maximum, or when a minreg schedule would make pressure worse.
void GCNIterativeScheduler::scheduleMinReg(bool force) {
const auto &ST = MF.getSubtarget<SISubtarget>();
const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
for (auto R : Regions) {
if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
break;
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
if (!force && MaxPressure.less(ST, RP, TgtOcc))
break;
scheduleRegion(*R, MinSchedule, RP);
DEBUG(printSchedResult(dbgs(), R, RP));
MaxPressure = RP;
}
}

View File

@@ -0,0 +1,118 @@
//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
// Iterative scheduler: records all scheduling regions during the normal
// MachineScheduler walk and (re)schedules them in finalizeSchedule()
// according to the strategy selected at construction.
class GCNIterativeScheduler : public ScheduleDAGMILive {
typedef ScheduleDAGMILive BaseClass;
public:
// Available scheduling strategies, selected at construction time.
enum StrategyKind {
SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,
SCHEDULE_LEGACYMAXOCCUPANCY
};
GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S);
// Overridden to emit debug info only; the real work is deferred.
void schedule() override;
// Overridden to record the region for later processing.
void enterRegion(MachineBasicBlock *BB,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned RegionInstrs) override;
// Dispatches to the selected strategy once all regions are collected.
void finalizeSchedule() override;
protected:
typedef ArrayRef<const SUnit*> ScheduleRef;
// A schedule detached from the DAG (plain MachineInstr pointers) along
// with the register pressure it was measured to produce.
struct TentativeSchedule {
std::vector<MachineInstr*> Schedule;
GCNRegPressure MaxPressure;
};
struct Region {
// Fields except for BestSchedule are supposed to reflect current IR state
// `const` fields are to emphasize they shouldn't change for any schedule.
MachineBasicBlock::iterator Begin;
// End is either a boundary instruction or end of basic block
const MachineBasicBlock::iterator End;
const unsigned NumRegionInstrs;
GCNRegPressure MaxPressure;
// best schedule for the region so far (not scheduled yet)
std::unique_ptr<TentativeSchedule> BestSchedule;
};
// Regions are bump-allocated and live for the scheduler's lifetime.
SpecificBumpPtrAllocator<Region> Alloc;
std::vector<Region*> Regions;
MachineSchedContext *Context;
const StrategyKind Strategy;
// mutable: getRegionPressure() reuses tracker state across const calls.
mutable GCNUpwardRPTracker UPTracker;
class BuildDAG;
class OverrideLegacyStrategy;
template <typename Range>
GCNRegPressure getSchedulePressure(const Region &R,
Range &&Schedule) const;
GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End) const;
GCNRegPressure getRegionPressure(const Region &R) const {
return getRegionPressure(R.Begin, R.End);
}
void setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
void scheduleBest(Region &R);
std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
void sortRegionsByPressure(unsigned TargetOcc);
template <typename Range>
void scheduleRegion(Region &R, Range &&Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
unsigned tryMaximizeOccupancy(unsigned TargetOcc =
std::numeric_limits<unsigned>::max());
void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);
void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,
const Region *R,
const GCNRegPressure &RP) const;
void printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const;
};
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H

View File

@@ -0,0 +1,266 @@
//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
// Greedy list scheduler that picks instructions so as to keep register
// usage low. Ready candidates live in an intrusive queue; NumPreds tracks
// each node's unscheduled predecessors, with the all-ones value serving as
// the "already scheduled" sentinel.
class GCNMinRegScheduler {
struct Candidate : ilist_node<Candidate> {
const SUnit *SU;
int Priority;
Candidate(const SUnit *SU_, int Priority_ = 0)
: SU(SU_), Priority(Priority_) {}
};
// Candidates are bump-allocated and released all at once.
SpecificBumpPtrAllocator<Candidate> Alloc;
typedef simple_ilist<Candidate> Queue;
Queue RQ; // Ready queue
// Indexed by SUnit::NodeNum; UINT_MAX marks a scheduled node.
std::vector<unsigned> NumPreds;
bool isScheduled(const SUnit *SU) const {
assert(!SU->isBoundaryNode());
return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
}
void setIsScheduled(const SUnit *SU) {
assert(!SU->isBoundaryNode());
NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
}
unsigned getNumPreds(const SUnit *SU) const {
assert(!SU->isBoundaryNode());
assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
return NumPreds[SU->NodeNum];
}
// Returns the decremented count; zero means the node just became ready.
unsigned decNumPreds(const SUnit *SU) {
assert(!SU->isBoundaryNode());
assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
return --NumPreds[SU->NodeNum];
}
void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
int getReadySuccessors(const SUnit *SU) const;
int getNotReadySuccessors(const SUnit *SU) const;
template <typename Calc>
unsigned findMax(unsigned Num, Calc C);
Candidate* pickCandidate();
void bumpPredsPriority(const SUnit *SchedSU, int Priority);
void releaseSuccessors(const SUnit* SU, int Priority);
public:
std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
};
// Seed the per-node counters from the DAG's NumPredsLeft values.
void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
  NumPreds.clear();
  NumPreds.reserve(SUnits.size());
  for (const SUnit &SU : SUnits)
    NumPreds.push_back(SU.NumPredsLeft);
}
// Count SU's successors that would have every predecessor scheduled once SU
// itself is scheduled - i.e. the successors SU's scheduling makes ready.
// (Iterates dependence edges by const reference to avoid the per-iteration
// SDep copies the original range-for made.)
int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
  unsigned NumSchedSuccs = 0;
  for (const auto &SDep : SU->Succs) {
    bool WouldBeScheduled = true;
    for (const auto &PDep : SDep.getSUnit()->Preds) {
      const auto *PSU = PDep.getSUnit();
      assert(!PSU->isBoundaryNode());
      // Any other unscheduled predecessor keeps this successor blocked.
      if (PSU != SU && !isScheduled(PSU)) {
        WouldBeScheduled = false;
        break;
      }
    }
    NumSchedSuccs += WouldBeScheduled ? 1 : 0;
  }
  return NumSchedSuccs;
}
// Successors of SU that would still be blocked by other unscheduled
// predecessors after SU is scheduled.
int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
return SU->Succs.size() - getReadySuccessors(SU);
}
// Rate the first Num candidates of the ready queue with C. Every candidate
// that ties or beats the running maximum is rotated to the front of the
// queue, so after the scan the queue front holds a candidate with the
// maximum rating. Returns how many candidates share that maximum.
template <typename Calc>
unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
assert(!RQ.empty() && Num <= RQ.size());
typedef decltype(C(*RQ.begin())) T;
T Max = std::numeric_limits<T>::min();
unsigned NumMax = 0;
for (auto I = RQ.begin(); Num; --Num) {
T Cur = C(*I);
if (Cur >= Max) {
if (Cur > Max) {
Max = Cur;
NumMax = 1;
} else
++NumMax;
// Advance the iterator before unlinking, then rotate the candidate
// to the queue front.
auto &Cand = *I++;
RQ.remove(Cand);
RQ.push_front(Cand);
continue;
}
++I;
}
return NumMax;
}
// Pick the next candidate via a cascade of tie-breaking metrics:
// 1) highest priority, 2) fewest successors left non-ready, 3) most
// successors made ready, 4) lowest NodeNum (original program order).
// Each findMax round filters the tie set and rotates the winners to the
// queue front, so the final choice is always RQ.front().
GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
do {
unsigned Num = RQ.size();
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
int Res = getNotReadySuccessors(SU);
DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
<< Res << " successors, metric = " << -Res << '\n');
return -Res;
});
if (Num == 1) break;
DEBUG(dbgs() << "\nSelecting most producing candidate among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
auto Res = getReadySuccessors(SU);
DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
<< Res << " successors, metric = " << Res << '\n');
return Res;
});
if (Num == 1) break;
// All metrics tied; fall back to program order over the remaining set.
Num = Num ? Num : RQ.size();
DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
<< Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
assert(Num == 1);
} while (false);
return &RQ.front();
}
// Called when scheduling SchedSU made no successor ready: raise the
// priority of every unscheduled instruction in the predecessor closure of
// SchedSU's non-ready data successors, so the producers still needed to
// unblock them get picked sooner.
void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
SmallPtrSet<const SUnit*, 32> Set;
// Seed with the direct unscheduled predecessors of each blocked data
// successor.
for (const auto &S : SchedSU->Succs) {
if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
S.getKind() != SDep::Data)
continue;
for (const auto &P : S.getSUnit()->Preds) {
auto PSU = P.getSUnit();
assert(!PSU->isBoundaryNode());
if (PSU != SchedSU && !isScheduled(PSU)) {
Set.insert(PSU);
}
}
}
// Expand the seed set to its full unscheduled predecessor closure.
SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
while (!Worklist.empty()) {
auto SU = Worklist.pop_back_val();
assert(!SU->isBoundaryNode());
for (const auto &P : SU->Preds) {
if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
Set.insert(P.getSUnit()).second)
Worklist.push_back(P.getSUnit());
}
}
DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
<< ")'s non-ready successors of " << Priority
<< " priority in ready queue: ");
const auto SetEnd = Set.end();
// Apply the new priority to queued candidates that are in the closure.
for (auto &C : RQ) {
if (Set.find(C.SU) != SetEnd) {
C.Priority = Priority;
DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
}
DEBUG(dbgs() << '\n');
}
// Account for SU being scheduled: decrement the predecessor count of each
// non-weak successor and push the ones that became ready onto the queue
// with the given priority.
void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
for (const auto &S : SU->Succs) {
auto SuccSU = S.getSUnit();
if (S.isWeak())
continue;
assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
}
}
// Run the scheduler: seed the queue with the top roots, then repeatedly
// pick the best ready candidate, schedule it and release the successors it
// unblocks. StepNo stamps the priority of newly released candidates.
std::vector<const SUnit*>
GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG) {
const auto &SUnits = DAG.SUnits;
std::vector<const SUnit*> Schedule;
Schedule.reserve(SUnits.size());
initNumPreds(SUnits);
int StepNo = 0;
for (auto SU : TopRoots) {
RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
}
releaseSuccessors(&DAG.EntrySU, StepNo);
while (!RQ.empty()) {
DEBUG(
dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
"Ready queue:";
for (auto &C : RQ)
dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
dbgs() << '\n';
);
auto C = pickCandidate();
assert(C);
RQ.remove(*C);
auto SU = C->SU;
DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
setIsScheduled(SU);
// If this pick made nothing ready, boost the producers its consumers
// are still waiting for.
if (getReadySuccessors(SU) == 0)
bumpPredsPriority(SU, StepNo);
++StepNo;
}
assert(SUnits.size() == Schedule.size());
return Schedule;
}
namespace llvm {
// Public entry point: run the minimal-register scheduler over DAG starting
// from TopRoots and return the chosen instruction order.
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
                                             const ScheduleDAG &DAG) {
  GCNMinRegScheduler Scheduler;
  return Scheduler.schedule(TopRoots, DAG);
}
} // end namespace llvm

View File

@@ -0,0 +1,355 @@
//===------------------------- GCNRegPressure.cpp - -----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
using namespace llvm;
#define DEBUG_TYPE "misched"
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump all virtual registers (and their live subranges, when present) that
// are live at slot index SI, preceded by the instruction at that index.
void llvm::printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
dbgs() << "Live regs at " << SI << ": "
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
bool firstTime = true;
for (const auto &S : LI.subranges()) {
if (!S.liveAt(SI)) continue;
if (firstTime) {
// Print the register header once before its live subranges.
dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
<< '\n';
firstTime = false;
}
dbgs() << " " << S << '\n';
++Num;
}
} else if (LI.liveAt(SI)) {
dbgs() << " " << LI << '\n';
++Num;
}
}
if (!Num) dbgs() << " <none>\n";
}
// Structural comparison of two live-register maps: equal iff they contain
// the same registers with identical lane masks.
static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
                    const GCNRPTracker::LiveRegSet &S2) {
  if (S1.size() != S2.size())
    return false;
  for (const auto &Entry : S1) {
    auto Match = S2.find(Entry.first);
    if (Match == S2.end())
      return false;
    if (Match->second != Entry.second)
      return false;
  }
  return true;
}
// Return a copy of LR with all zero-lane-mask entries removed.
static GCNRPTracker::LiveRegSet
stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
  GCNRPTracker::LiveRegSet Stripped;
  for (const auto &Entry : LR)
    if (Entry.second.any())
      Stripped.insert(Entry);
  return Stripped;
}
#endif
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure

// Classify a virtual register as a single 32-bit SGPR/VGPR or an SGPR/VGPR
// tuple, based on its register class.
unsigned GCNRegPressure::getRegKind(unsigned Reg,
                                    const MachineRegisterInfo &MRI) {
  assert(TargetRegisterInfo::isVirtualRegister(Reg));
  const auto *RC = MRI.getRegClass(Reg);
  const auto *STI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // A 4-byte class is a single 32-bit register; anything wider is a tuple.
  const bool Single32 = RC->getSize() == 4;
  if (STI->isSGPRClass(RC))
    return Single32 ? SGPR32 : SGPR_TUPLE;
  return Single32 ? VGPR32 : VGPR_TUPLE;
}
// Account for a lane-mask transition of virtual register Reg from PrevMask
// to NewMask. The masks are normalized so NewMask is the larger one; Sign
// records whether pressure grows (+1) or shrinks (-1).
void GCNRegPressure::inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
const MachineRegisterInfo &MRI) {
if (NewMask == PrevMask)
return;
int Sign = 1;
if (NewMask < PrevMask) {
std::swap(NewMask, PrevMask);
Sign = -1;
}
#ifndef NDEBUG
const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
#endif
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
// A single 32-bit register is either fully live or fully dead.
assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
// Each newly covered lane contributes one 32-bit register...
Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
// ...and the tuple weight is accounted once, when the register first
// becomes live (PrevMask was empty).
if (PrevMask.none()) {
assert(NewMask.any());
Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
}
break;
default: llvm_unreachable("Unknown register kind");
}
}
// Returns true if this pressure is lower (i.e. better) than O's, with
// occupancy capped at MaxOccupancy. Achievable occupancy dominates; ties
// are broken by tuple weights of the more critical register kind, then by
// plain register counts.
bool GCNRegPressure::less(const SISubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGRPNum()));
const auto VGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(getVGRPNum()));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
const auto OtherVGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
// Higher achievable occupancy means lower effective pressure.
if (Occ != OtherOcc)
return Occ > OtherOcc;
// A register kind is "important" when it is the one limiting occupancy.
bool SGPRImportant = SGPROcc < VGPROcc;
const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
// if both pressures disagree on what is more important compare vgprs
if (SGPRImportant != OtherSGPRImportant) {
SGPRImportant = false;
}
// compare large regs pressure
bool SGPRFirst = SGPRImportant;
// Two rounds: the important kind first, then the other one.
for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
if (SGPRFirst) {
auto SW = getSGPRTuplesWeight();
auto OtherSW = O.getSGPRTuplesWeight();
if (SW != OtherSW)
return SW < OtherSW;
} else {
auto VW = getVGPRTuplesWeight();
auto OtherVW = O.getVGPRTuplesWeight();
if (VW != OtherVW)
return VW < OtherVW;
}
}
return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
(getVGRPNum() < O.getVGRPNum());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Dump the pressure record; with a subtarget, annotate each register
// count with the occupancy it allows, and append the combined occupancy.
void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
  OS << "VGPRs: " << getVGRPNum();
  if (ST)
    OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
  OS << ", SGPRs: " << getSGRPNum();
  if (ST)
    OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
  OS << ", LVGPR WT: " << getVGPRTuplesWeight()
     << ", LSGPR WT: " << getSGPRTuplesWeight();
  if (ST)
    OS << " -> Occ: " << getOccupancy(*ST);
  OS << '\n';
}
#endif
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
// Compute the mask of Reg's lanes that are live at slot index SI
// according to LIS.
LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
                                  SlotIndex SI,
                                  const LiveIntervals &LIS,
                                  const MachineRegisterInfo &MRI) {
  assert(!MRI.reg_nodbg_empty(Reg));
  const auto &LI = LIS.getInterval(Reg);
  LaneBitmask Mask;

  // Without subranges the whole register is live or dead as a unit.
  if (!LI.hasSubRanges()) {
    if (LI.liveAt(SI))
      Mask = MRI.getMaxLaneMaskForVReg(Reg);
    return Mask;
  }

  // Accumulate the lanes of every subrange live at SI.
  for (const auto &SR : LI.subranges()) {
    if (!SR.liveAt(SI))
      continue;
    Mask |= SR.LaneMask;
    assert(Mask < MRI.getMaxLaneMaskForVReg(Reg) ||
           Mask == MRI.getMaxLaneMaskForVReg(Reg));
  }
  return Mask;
}
// Build the set of virtual registers (with their live lane masks) that
// are live at slot index SI.
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
                                           const LiveIntervals &LIS,
                                           const MachineRegisterInfo &MRI) {
  GCNRPTracker::LiveRegSet Live;
  const unsigned NumVRegs = MRI.getNumVirtRegs();
  for (unsigned Idx = 0; Idx != NumVRegs; ++Idx) {
    const auto Reg = TargetRegisterInfo::index2VirtReg(Idx);
    // Skip registers with no non-debug uses or defs.
    if (MRI.reg_nodbg_empty(Reg))
      continue;
    const auto Mask = getLiveLaneMask(Reg, SI, LIS, MRI);
    if (Mask.any())
      Live[Reg] = Mask;
  }
  return Live;
}
// Reset the tracker to the program point just below (after) MI: the live
// set is recomputed from LIS, and both current and maximum pressure are
// re-derived from it.
void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
  MRI = &MI.getParent()->getParent()->getRegInfo();
  LiveRegs = getLiveRegsAfter(MI, LIS);
  MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
// Mask of lanes written by the def operand MO: either the whole register
// or just the lanes of the written subregister.
LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
  assert(MO.isDef() && MO.isReg() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()));

  // We don't rely on read-undef flag because in case of tentative schedule
  // tracking it isn't set correctly yet. This works correctly however since
  // use mask has been tracked before using LIS.
  if (MO.getSubReg() == 0)
    return MRI->getMaxLaneMaskForVReg(MO.getReg());
  return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
// Mask of lanes read by the use operand MO.
LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
  assert(MO.isUse() && MO.isReg() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()));

  // An explicit subregister use reads exactly that subregister's lanes.
  if (auto SubReg = MO.getSubReg())
    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);

  auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
  if (MaxMask.getAsInteger() == 1) // cannot have subregs
    return MaxMask;

  // For a tentative schedule LIS isn't updated yet but livemask should remain
  // the same on any schedule. Subreg defs can be reordered but they all must
  // dominate uses anyway.
  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
  return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
}
// Move the tracker one instruction upward: from the state just below MI
// to the state just above it. Defs kill lanes, uses make lanes live.
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
  assert(MRI && "call reset first");

  LastTrackedMI = &MI;

  if (MI.isDebugValue())
    return;

  // process all defs first to ensure early clobbers are handled correctly
  // iterating over operands() to catch implicit defs
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.isDef() ||
        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      continue;
    auto Reg = MO.getReg();
    auto &LiveMask = LiveRegs[Reg];
    auto PrevMask = LiveMask;
    // Moving upward, defined lanes stop being live above the def.
    LiveMask &= ~getDefRegMask(MO);
    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
  }

  // then all uses
  for (const auto &MO : MI.uses()) {
    if (!MO.isReg() || !MO.readsReg() ||
        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      continue;
    auto Reg = MO.getReg();
    auto &LiveMask = LiveRegs[Reg];
    auto PrevMask = LiveMask;
    // Lanes read by MI must be live above it.
    LiveMask |= getUsedRegMask(MO);
    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
  }

  MaxPressure = max(MaxPressure, CurPressure);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
// Print the differences between the LIS-reported live set and the
// tracker's live set: entries missing on either side, and entries whose
// lane masks disagree.
static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
                           const GCNRPTracker::LiveRegSet &TrackedLR,
                           const TargetRegisterInfo *TRI) {
  // Entries the tracker has but LIS lacks or disagrees with.
  for (const auto &Entry : TrackedLR) {
    const auto It = LISLR.find(Entry.first);
    if (It == LISLR.end()) {
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << ":L" << PrintLaneMask(Entry.second)
             << " isn't found in LIS reported set\n";
    } else if (It->second != Entry.second) {
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << " masks doesn't match: LIS reported "
             << PrintLaneMask(It->second)
             << ", tracked "
             << PrintLaneMask(Entry.second)
             << '\n';
    }
  }
  // Entries LIS reports live that the tracker missed entirely.
  for (const auto &Entry : LISLR) {
    if (TrackedLR.find(Entry.first) == TrackedLR.end())
      dbgs() << " " << PrintReg(Entry.first, TRI)
             << ":L" << PrintLaneMask(Entry.second)
             << " isn't found in tracked set\n";
  }
}
// Debug-only consistency check: compare the tracker state after the last
// recede() against what LIS reports for the same slot. Returns false
// (after dumping diagnostics) on any mismatch.
bool GCNUpwardRPTracker::isValid() const {
  const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
  const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
  // The tracker may keep zero-mask entries around; drop them before
  // comparing against the LIS-derived set.
  const auto TrackedLR = stripEmpty(LiveRegs);

  if (!isEqual(LISLR, TrackedLR)) {
    dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
              " LIS reported livesets mismatch:\n";
    printLivesAt(SI, LIS, *MRI);
    reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
    return false;
  }

  // The live sets agree; the pressures derived from them must agree too.
  auto LISPressure = getRegPressure(*MRI, LISLR);
  if (LISPressure != CurPressure) {
    dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
    CurPressure.print(dbgs());
    dbgs() << "LIS rpt: ";
    LISPressure.print(dbgs());
    return false;
  }
  return true;
}
#endif

View File

@ -0,0 +1,170 @@
//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#include "AMDGPUSubtarget.h"
#include <limits>
namespace llvm {
/// Register pressure record for GCN targets, kept as four counters
/// (see RegKind). NOTE(review): the getter names spell "SGRP"/"VGRP"
/// (a historical typo) — callers depend on these names, so they are kept.
struct GCNRegPressure {
  enum RegKind {
    SGPR32,      // number of live 32-bit SGPR lanes
    SGPR_TUPLE,  // accumulated pressure-set weight of live SGPR tuples
    VGPR32,      // number of live 32-bit VGPR lanes
    VGPR_TUPLE,  // accumulated pressure-set weight of live VGPR tuples
    TOTAL_KINDS
  };

  GCNRegPressure() {
    clear();
  }

  bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }

  void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }

  unsigned getSGRPNum() const { return Value[SGPR32]; }
  unsigned getVGRPNum() const { return Value[VGPR32]; }

  unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
  unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }

  // Occupancy allowed by this pressure: the minimum of what the SGPR and
  // VGPR counts each permit on the given subtarget.
  unsigned getOccupancy(const SISubtarget &ST) const {
    return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
                    ST.getOccupancyWithNumVGPRs(getVGRPNum()));
  }

  // Account for a change of Reg's live lane mask from PrevMask to NewMask.
  void inc(unsigned Reg,
           LaneBitmask PrevMask,
           LaneBitmask NewMask,
           const MachineRegisterInfo &MRI);

  bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
    return getOccupancy(ST) > O.getOccupancy(ST);
  }

  // True if this pressure is preferable to O; occupancy is compared
  // first, capped at MaxOccupancy (see the .cpp for the tie-breaks).
  bool less(const SISubtarget &ST, const GCNRegPressure& O,
            unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;

  bool operator==(const GCNRegPressure &O) const {
    return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
  }

  bool operator!=(const GCNRegPressure &O) const {
    return !(*this == O);
  }

  void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
  void dump() const { print(dbgs()); }

private:
  unsigned Value[TOTAL_KINDS];

  static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);

  friend GCNRegPressure max(const GCNRegPressure &P1,
                            const GCNRegPressure &P2);
};
// Component-wise maximum of two pressure records.
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
  GCNRegPressure Res;
  for (unsigned Kind = 0; Kind != GCNRegPressure::TOTAL_KINDS; ++Kind)
    Res.Value[Kind] = std::max(P1.Value[Kind], P2.Value[Kind]);
  return Res;
}
/// Common state for GCN register-pressure trackers: the current live set,
/// the current and maximum observed pressure, and the last instruction
/// processed. Concrete trackers (e.g. GCNUpwardRPTracker) update it.
class GCNRPTracker {
public:
  typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;

protected:
  LiveRegSet LiveRegs;  // live virtual registers -> live lane masks
  GCNRegPressure CurPressure, MaxPressure;
  const MachineInstr *LastTrackedMI = nullptr;
  // mutable: set lazily from the tracked function by derived trackers.
  mutable const MachineRegisterInfo *MRI = nullptr;
  GCNRPTracker() {}
public:
  // live regs for the current state
  const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
  const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }

  // returns MaxPressure, resetting it
  decltype(MaxPressure) moveMaxPressure() {
    auto Res = MaxPressure;
    MaxPressure.clear();
    return Res;
  }

  // Transfers ownership of the live set to the caller; the tracker's set
  // is left in a moved-from state.
  decltype(LiveRegs) moveLiveRegs() {
    return std::move(LiveRegs);
  }
};
/// Tracks register pressure bottom-up: reset() below an instruction, then
/// recede() over each instruction moving toward the region start.
class GCNUpwardRPTracker : public GCNRPTracker {
  const LiveIntervals &LIS;

  // Mask of lanes written by a def operand.
  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
  // Mask of lanes read by a use operand.
  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
  GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
  // reset tracker to the point just below MI
  // filling live regs upon this point using LIS
  void reset(const MachineInstr &MI);
  // move to the state just above the MI
  void recede(const MachineInstr &MI);
  // checks whether the tracker's state after receding MI corresponds
  // to reported by LIS
  bool isValid() const;
};
LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
// Live set at the point just after MI (its dead slot).
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
                                                 const LiveIntervals &LIS) {
  const auto &MRI = MI.getParent()->getParent()->getRegInfo();
  return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, MRI);
}
// Live set at the point just before MI (its base slot).
inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
                                                  const LiveIntervals &LIS) {
  const auto &MRI = MI.getParent()->getParent()->getRegInfo();
  return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, MRI);
}
// Derive a pressure record from a range of (register, lane mask) pairs,
// treating every entry as becoming live from nothing.
template <typename Range>
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
                              Range &&LiveRegs) {
  GCNRegPressure Pressure;
  for (const auto &RegMask : LiveRegs)
    Pressure.inc(RegMask.first, LaneBitmask::getNone(), RegMask.second, MRI);
  return Pressure;
}
void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H

View File

@ -45,8 +45,6 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
if (MF != &DAG->MF)
TargetOccupancy = 0;
MF = &DAG->MF;
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
@ -531,7 +529,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Stage++;
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
S.TargetOccupancy = MinOccupancy;
S.setTargetOccupancy(MinOccupancy);
MachineBasicBlock *MBB = nullptr;
for (auto Region : Regions) {

View File

@ -55,6 +55,8 @@ public:
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
class GCNScheduleDAGMILive : public ScheduleDAGMILive {

View File

@ -1,4 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
; We expect a two digit VGPR usage here, not a three digit.
; CHECK: NumVgprs: {{[0-9][0-9]$}}

View File

@ -0,0 +1,288 @@
; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; SI: NumSgprs: {{[1-9]$}}
; SI: NumVgprs: {{[1-9]$}}
; stores may alias loads
; VI: NumSgprs: {{[1-5][0-9]$}}
; VI: NumVgprs: {{[1-3][0-9]$}}
; Loads 30 (a, b, c) triples from LDS, computes fma(a, b, c) for each, and
; stores the 30 results to global memory. All 90 loads precede all stores,
; giving the scheduler a large window where every loaded value is live —
; a stress test for register-pressure-aware scheduling.
define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
bb:
  ; LDS addresses of the 30 operand triples. Triples are 768 elements
  ; apart; within a triple b = a + 248 and c = b + 256.
  %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
  %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
  %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
  %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
  %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
  %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
  %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
  %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
  %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
  %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
  %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
  %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
  %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
  %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
  %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
  %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
  %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
  %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
  %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
  %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
  %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
  %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
  %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
  %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
  %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
  %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
  %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
  %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
  %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
  %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
  %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
  %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
  %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
  %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
  %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
  %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
  %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
  %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
  %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
  %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
  %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
  %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
  %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
  %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
  %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
  %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
  %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
  %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
  %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
  %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
  %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
  %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
  %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
  %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
  %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
  %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
  %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
  %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
  %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
  %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
  %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
  %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
  %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
  %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
  %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
  %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
  %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
  %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
  %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
  %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
  %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
  %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
  %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
  %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
  %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
  %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
  %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
  %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
  %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
  %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
  %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
  %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
  %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
  %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
  %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
  %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
  %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
  %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
  %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
  %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
  ; Load all 90 operands before any arithmetic or store.
  %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
  %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
  %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
  %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
  %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
  %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
  %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
  %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
  %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
  %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
  %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
  %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
  %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
  %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
  %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
  %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
  %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
  %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
  %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
  %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
  %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
  %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
  %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
  %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
  %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
  %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
  %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
  %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
  %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
  %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
  %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
  %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
  %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
  %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
  %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
  %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
  %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
  %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
  %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
  %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
  %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
  %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
  %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
  %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
  %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
  %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
  %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
  %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
  %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
  %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
  %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
  %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
  %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
  %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
  %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
  %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
  %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
  %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
  %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
  %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
  %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
  %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
  %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
  %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
  %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
  %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
  %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
  %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
  %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
  %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
  %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
  %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
  %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
  %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
  %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
  %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
  %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
  %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
  %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
  %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
  %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
  %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
  %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
  %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
  %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
  %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
  %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
  %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
  %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
  %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
  ; res.k = a.k * b.k + c.k
  %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
  %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
  %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
  %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3)
  %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4)
  %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5)
  %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6)
  %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7)
  %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8)
  %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9)
  %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10)
  %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11)
  %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12)
  %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13)
  %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14)
  %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15)
  %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16)
  %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17)
  %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18)
  %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19)
  %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20)
  %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21)
  %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22)
  %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23)
  %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24)
  %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25)
  %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26)
  %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
  %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
  %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
  ; Global output addresses: result k goes to element 2*k.
  %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
  %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
  %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
  %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
  %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
  %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
  %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
  %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
  %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
  %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
  %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
  %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
  %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
  %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
  %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
  %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
  %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
  %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
  %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
  %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
  %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
  %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
  %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
  %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
  %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
  %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
  %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
  %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
  %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
  %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
  ; Stores come last (and may alias the loads, constraining the scheduler).
  store float %res.0, float addrspace(1)* %adr.res.0, align 4
  store float %res.1, float addrspace(1)* %adr.res.1, align 4
  store float %res.2, float addrspace(1)* %adr.res.2, align 4
  store float %res.3, float addrspace(1)* %adr.res.3, align 4
  store float %res.4, float addrspace(1)* %adr.res.4, align 4
  store float %res.5, float addrspace(1)* %adr.res.5, align 4
  store float %res.6, float addrspace(1)* %adr.res.6, align 4
  store float %res.7, float addrspace(1)* %adr.res.7, align 4
  store float %res.8, float addrspace(1)* %adr.res.8, align 4
  store float %res.9, float addrspace(1)* %adr.res.9, align 4
  store float %res.10, float addrspace(1)* %adr.res.10, align 4
  store float %res.11, float addrspace(1)* %adr.res.11, align 4
  store float %res.12, float addrspace(1)* %adr.res.12, align 4
  store float %res.13, float addrspace(1)* %adr.res.13, align 4
  store float %res.14, float addrspace(1)* %adr.res.14, align 4
  store float %res.15, float addrspace(1)* %adr.res.15, align 4
  store float %res.16, float addrspace(1)* %adr.res.16, align 4
  store float %res.17, float addrspace(1)* %adr.res.17, align 4
  store float %res.18, float addrspace(1)* %adr.res.18, align 4
  store float %res.19, float addrspace(1)* %adr.res.19, align 4
  store float %res.20, float addrspace(1)* %adr.res.20, align 4
  store float %res.21, float addrspace(1)* %adr.res.21, align 4
  store float %res.22, float addrspace(1)* %adr.res.22, align 4
  store float %res.23, float addrspace(1)* %adr.res.23, align 4
  store float %res.24, float addrspace(1)* %adr.res.24, align 4
  store float %res.25, float addrspace(1)* %adr.res.25, align 4
  store float %res.26, float addrspace(1)* %adr.res.26, align 4
  store float %res.27, float addrspace(1)* %adr.res.27, align 4
  store float %res.28, float addrspace(1)* %adr.res.28, align 4
  store float %res.29, float addrspace(1)* %adr.res.29, align 4
  ret void
}
declare float @llvm.fmuladd.f32(float, float, float) #0
attributes #0 = { nounwind readnone }