[AMDGPU] Attempt to reschedule without clustering

We want more load/store clustering, but we also want to maintain low
register pressure, and these are opposing goals. Allow the scheduler to
reschedule regions without the clustering mutations applied if we hit a
register limit.

Differential Revision: https://reviews.llvm.org/D73386
Stanislav Mekhanoshin, 2020-01-23 16:18:16 -08:00
commit 53eb0f8c07, parent 97711228fd
3 changed files with 99 additions and 18 deletions
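In outline: the initial pass schedules each region with the usual load/store clustering mutations and flags any region whose pressure exceeds the VGPR/SGPR budget; a later pass re-runs just the flagged regions with the mutation list swapped out, keeping the unclustered schedule only when it lowers pressure. Below is a minimal standalone sketch of that control flow. It collapses the patch's four stages into two passes for brevity, and every name in it (Mutation, Region, scheduleRegion, scheduleFunction) is an illustrative stand-in, not the LLVM API:

#include <memory>
#include <vector>

// Illustrative stand-ins only; none of these are the LLVM types.
struct Mutation { virtual ~Mutation() = default; };

struct Region {
  unsigned Pressure = 0;        // pressure of the currently kept schedule
  bool NeedsReschedule = false; // over the register budget after pass 1
};

// Placeholder for the real per-region scheduler: build the DAG, apply the
// active mutations, list-schedule, and return the resulting pressure.
unsigned scheduleRegion(Region &R,
                        const std::vector<std::unique_ptr<Mutation>> &Muts) {
  return R.Pressure; // no-op stand-in
}

void scheduleFunction(std::vector<Region> &Regions,
                      std::vector<std::unique_ptr<Mutation>> &Mutations,
                      unsigned RegLimit) {
  // Pass 1: normal scheduling with clustering; flag over-budget regions.
  for (Region &R : Regions) {
    R.Pressure = scheduleRegion(R, Mutations);
    R.NeedsReschedule = R.Pressure > RegLimit;
  }

  // Pass 2: retry only the flagged regions with the mutations moved aside,
  // so no cluster edges get added, then restore the mutations.
  std::vector<std::unique_ptr<Mutation>> SavedMutations;
  SavedMutations.swap(Mutations); // Mutations is now empty
  for (Region &R : Regions) {
    if (!R.NeedsReschedule)
      continue;
    unsigned NewPressure = scheduleRegion(R, Mutations);
    if (NewPressure < R.Pressure)
      R.Pressure = NewPressure; // unclustered schedule wins
    // otherwise the clustered schedule is kept (revert path omitted)
  }
  SavedMutations.swap(Mutations);
}

The trick is the same one the patch uses: moving the mutation vector aside is enough to disable clustering, because cluster edges are only ever added by the mutations.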

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

@@ -316,13 +316,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
   ST(MF.getSubtarget<GCNSubtarget>()),
   MFI(*MF.getInfo<SIMachineFunctionInfo>()),
   StartingOccupancy(MFI.getOccupancy()),
-  MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+  MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == 0) {
+  if (Stage == Collect) {
     // Just record regions at the first pass.
     Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
     return;
@@ -348,6 +348,7 @@ void GCNScheduleDAGMILive::schedule() {
 
   ScheduleDAGMILive::schedule();
   Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  RescheduleRegions[RegionIdx] = false;
 
   if (!LIS)
     return;
@@ -389,20 +390,28 @@ void GCNScheduleDAGMILive::schedule() {
                       << MinOccupancy << ".\n");
   }
 
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs)
+    RescheduleRegions[RegionIdx] = true;
+
   if (WavesAfter >= MinOccupancy) {
-    unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
-    unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
-    if (WavesAfter > MFI.getMinWavesPerEU() ||
+    if (Stage == UnclusteredReschedule &&
+        !PressureAfter.less(ST, PressureBefore)) {
+      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
         PressureAfter.less(ST, PressureBefore) ||
-        (TotalVGPRs >= PressureAfter.getVGPRNum() &&
-         TotalSGPRs >= PressureAfter.getSGPRNum())) {
+        !RescheduleRegions[RegionIdx]) {
       Pressure[RegionIdx] = PressureAfter;
       return;
+    } else {
+      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     }
-    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
   }
 
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RescheduleRegions[RegionIdx] = true;
   RegionEnd = RegionBegin;
   for (MachineInstr *MI : Unsched) {
     if (MI->isDebugInstr())
@@ -532,33 +541,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
+  RescheduleRegions.resize(Regions.size());
+  RescheduleRegions.set();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
   do {
     Stage++;
     RegionIdx = 0;
     MachineBasicBlock *MBB = nullptr;
 
-    if (Stage > 1) {
+    if (Stage > InitialSchedule) {
+      if (!LIS)
+        break;
+
       // Retry function scheduling if we found resulting occupancy and it is
       // lower than used for first pass scheduling. This will give more freedom
       // to schedule low register pressure blocks.
       // Code is partially copied from MachineSchedulerBase::scheduleRegions().
-      if (!LIS || StartingOccupancy <= MinOccupancy)
-        break;
+      if (Stage == UnclusteredReschedule) {
+        if (RescheduleRegions.none())
+          continue;
+        LLVM_DEBUG(dbgs() <<
+          "Retrying function scheduling without clustering.\n");
+      }
 
-      LLVM_DEBUG(
-          dbgs()
-          << "Retrying function scheduling with lowest recorded occupancy "
-          << MinOccupancy << ".\n");
+      if (Stage == ClusteredLowOccupancyReschedule) {
+        if (StartingOccupancy <= MinOccupancy)
+          break;
-      S.setTargetOccupancy(MinOccupancy);
+        LLVM_DEBUG(
+            dbgs()
+            << "Retrying function scheduling with lowest recorded occupancy "
+            << MinOccupancy << ".\n");
+        S.setTargetOccupancy(MinOccupancy);
+      }
     }
 
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+
     for (auto Region : Regions) {
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+        continue;
+
       RegionBegin = Region.first;
       RegionEnd = Region.second;
@@ -566,7 +597,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 
       if (MBB) finishBlock();
       MBB = RegionBegin->getParent();
       startBlock(MBB);
-      if (Stage == 1)
+      if (Stage == InitialSchedule)
         computeBlockPressure(MBB);
     }
@@ -594,5 +625,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
     }
     finishBlock();
 
-  } while (Stage < 2);
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+  } while (Stage != LastStage);
 }
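Condensed, the per-region accept/revert decision that schedule() now makes reads roughly as follows. This is a gloss of the diff above, not a drop-in: the boolean parameters stand in for the GCNRegPressure comparisons, and keepSchedule itself is a hypothetical helper.

// Keep the region's new schedule unless it costs occupancy, the unclustered
// retry failed to lower pressure, or it would push registers into spilling.
bool keepSchedule(bool UnclusteredStage, unsigned WavesAfter,
                  unsigned MinOccupancy, unsigned MinWavesPerEU,
                  bool PressureImproved, bool OverRegisterLimit) {
  if (WavesAfter < MinOccupancy)
    return false; // occupancy regressed: revert and reschedule later
  if (UnclusteredStage && !PressureImproved)
    return false; // "Unclustered reschedule did not help."
  return WavesAfter > MinWavesPerEU || PressureImproved || !OverRegisterLimit;
}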

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

@@ -64,6 +64,14 @@ public:
 
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
+  enum : unsigned {
+    Collect,
+    InitialSchedule,
+    UnclusteredReschedule,
+    ClusteredLowOccupancyReschedule,
+    LastStage = ClusteredLowOccupancyReschedule
+  };
+
   const GCNSubtarget &ST;
 
   SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
+  // Records if a region is not yet scheduled, or schedule has been reverted,
+  // or we generally desire to reschedule it.
+  BitVector RescheduleRegions;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
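Taken together with the .cpp changes, the enum names four passes: Collect only records regions, InitialSchedule is the normal clustered pass, UnclusteredReschedule re-runs flagged regions with the mutations swapped out, and ClusteredLowOccupancyReschedule retries at the lowest occupancy recorded so far. A compilable gloss of how the enum drives the do/while in finalizeSchedule() follows; runPass is a hypothetical stand-in for the per-region scheduling loop, and the fixed-size bitset stands in for the BitVector:

#include <bitset>

enum : unsigned {
  Collect,
  InitialSchedule,
  UnclusteredReschedule,
  ClusteredLowOccupancyReschedule,
  LastStage = ClusteredLowOccupancyReschedule
};

void runStages(const std::bitset<32> &RescheduleRegions,
               unsigned StartingOccupancy, unsigned MinOccupancy,
               void (*runPass)(unsigned Stage)) {
  unsigned Stage = Collect; // regions were recorded during the initial walk
  do {
    ++Stage;
    if (Stage == UnclusteredReschedule && RescheduleRegions.none())
      continue; // nothing exceeded the register limit; skip this pass
    if (Stage == ClusteredLowOccupancyReschedule &&
        StartingOccupancy <= MinOccupancy)
      break; // occupancy never dropped; nothing left to retry
    runPass(Stage);
  } while (Stage != LastStage);
}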

llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll

@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Interleave loads and stores to fit into the 9 VGPR limit.
+; This requires avoiding load/store clustering.
+
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: NumVgprs: {{[0-9]$}}
+; GCN: ScratchSize: 0{{$}}
+
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
+  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
+  store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
+  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
+  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-num-vgpr"="9" }
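For reference, the checks above run against the gfx900 assembly produced by the RUN line; after a build, the single test can be exercised with the standard lit driver (the test path here is the one assumed above):

llvm-lit -v llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll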