[AMDGPU] Merge M0 initializations

Merges equivalent initializations of M0 and hoists them into a common
dominator block. Technically the same code can be used with any
register, physical or virtual.

Differential Revision: https://reviews.llvm.org/D32279

llvm-svn: 301228
Stanislav Mekhanoshin 2017-04-24 19:37:54 +00:00
parent 610c966a4e
commit bd5394be3d
4 changed files with 323 additions and 19 deletions
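To illustrate the transformation with a hypothetical MIR fragment (a sketch for this note, not taken from the commit): when two sibling blocks initialize M0 to the same immediate, the pass erases one init and moves the survivor into their nearest common dominator.

; Before: both paths re-initialize M0 with the same value.
bb.1:
  SI_INIT_M0 -1, implicit-def %m0
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
bb.2:
  SI_INIT_M0 -1, implicit-def %m0
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec

; After: a single init, hoisted into the common dominator bb.0.
bb.0:
  SI_INIT_M0 -1, implicit-def %m0
  ...
bb.1:
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
bb.2:
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec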

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

@@ -81,6 +81,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "si-fix-sgpr-copies"
 
+static cl::opt<bool> EnableM0Merge(
+  "amdgpu-enable-merge-m0",
+  cl::desc("Merge and hoist M0 initializations"),
+  cl::init(false));
+
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
@@ -108,7 +113,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)
@@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
   return true;
 }
 
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
-                                        const TargetRegisterInfo *TRI) {
-  DenseSet<MachineBasicBlock*> Visited;
+template <class UnaryPredicate>
+bool searchPredecessors(const MachineBasicBlock *MBB,
+                        const MachineBasicBlock *CutOff,
+                        UnaryPredicate Predicate) {
+  if (MBB == CutOff)
+    return false;
+
+  DenseSet<const MachineBasicBlock*> Visited;
   SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());
 
   while (!Worklist.empty()) {
-    MachineBasicBlock *mbb = Worklist.back();
-    Worklist.pop_back();
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
 
-    if (!Visited.insert(mbb).second)
+    if (!Visited.insert(MBB).second)
       continue;
-    if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+    if (MBB == CutOff)
+      continue;
+    if (Predicate(MBB))
       return true;
 
-    Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+    Worklist.append(MBB->pred_begin(), MBB->pred_end());
   }
 
   return false;
 }
 
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+                                        const TargetRegisterInfo *TRI) {
+  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
+// Checks if there is a potential path from instruction From to instruction To.
+// If CutOff is specified and sits on that path, the portion of the path above
+// CutOff is ignored and the path is reported as not reachable.
+static bool isReachable(const MachineInstr *From,
+                        const MachineInstr *To,
+                        const MachineBasicBlock *CutOff,
+                        MachineDominatorTree &MDT) {
+  // Either the block of From dominates the block of To, or the instructions
+  // are in the same block and From comes first.
+  if (MDT.dominates(From, To))
+    return true;
+
+  const MachineBasicBlock *MBBFrom = From->getParent();
+  const MachineBasicBlock *MBBTo = To->getParent();
+  if (MBBFrom == MBBTo)
+    return false;
+
+  // Instructions are in different blocks, do predecessor search.
+  // We should almost never get here since we do not usually produce M0 stores
+  // other than -1.
+  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
+           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
+}
+
+// Hoist and merge identical SGPR initializations into a common predecessor.
+// This is intended to combine M0 initializations, but can work with any
+// SGPR. A VGPR cannot be processed since we cannot guarantee vector
+// execution.
+static bool hoistAndMergeSGPRInits(unsigned Reg,
+                                   const MachineRegisterInfo &MRI,
+                                   MachineDominatorTree &MDT) {
+  // List of inits by immediate value.
+  typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+  InitListMap Inits;
+  // List of clobbering instructions.
+  SmallVector<MachineInstr*, 8> Clobbers;
+  bool Changed = false;
+
+  for (auto &MI : MRI.def_instructions(Reg)) {
+    MachineOperand *Imm = nullptr;
+    for (auto &MO: MI.operands()) {
+      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
+          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
+        Imm = nullptr;
+        break;
+      } else if (MO.isImm())
+        Imm = &MO;
+    }
+    if (Imm)
+      Inits[Imm->getImm()].push_front(&MI);
+    else
+      Clobbers.push_back(&MI);
+  }
+
+  for (auto &Init : Inits) {
+    auto &Defs = Init.second;
+
+    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
+      MachineInstr *MI1 = *I1;
+
+      for (auto I2 = std::next(I1); I2 != E; ) {
+        MachineInstr *MI2 = *I2;
+
+        // Check any possible interference.
+        auto intereferes = [&](MachineBasicBlock::iterator From,
+                               MachineBasicBlock::iterator To) -> bool {
+
+          assert(MDT.dominates(&*To, &*From));
+
+          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
+            const MachineBasicBlock *MBBFrom = From->getParent();
+            const MachineBasicBlock *MBBTo = To->getParent();
+            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
+            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
+            if (!MayClobberFrom && !MayClobberTo)
+              return false;
+            if ((MayClobberFrom && !MayClobberTo) ||
+                (!MayClobberFrom && MayClobberTo))
+              return true;
+            // Both can be clobbered. This is not an interference only if both
+            // are dominated by Clobber and belong to the same block, or if
+            // Clobber properly dominates To; given that To >> From, it then
+            // dominates both and is located in a common dominator.
+            return !((MBBFrom == MBBTo &&
+                      MDT.dominates(Clobber, &*From) &&
+                      MDT.dominates(Clobber, &*To)) ||
+                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
+          };
+
+          return (any_of(Clobbers, interferes)) ||
+                 (any_of(Inits, [&](InitListMap::value_type &C) {
+                    return C.first != Init.first && any_of(C.second, interferes);
+                  }));
+        };
+
+        if (MDT.dominates(MI1, MI2)) {
+          if (!intereferes(MI2, MI1)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
+                         << " " << *MI2);
+            MI2->eraseFromParent();
+            Defs.erase(I2++);
+            Changed = true;
+            continue;
+          }
+        } else if (MDT.dominates(MI2, MI1)) {
+          if (!intereferes(MI1, MI2)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+                         << " " << *MI1);
+            MI1->eraseFromParent();
+            Defs.erase(I1++);
+            Changed = true;
+            break;
+          }
+        } else {
+          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
+                                                     MI2->getParent());
+          if (!MBB) {
+            ++I2;
+            continue;
+          }
+
+          MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+          if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+                         << " " << *MI1 << "and moving from BB#"
+                         << MI2->getParent()->getNumber() << " to BB#"
+                         << I->getParent()->getNumber() << " " << *MI2);
+            I->getParent()->splice(I, MI2->getParent(), MI2);
+            MI1->eraseFromParent();
+            Defs.erase(I1++);
+            Changed = true;
+            break;
+          }
+        }
+        ++I2;
+      }
+      ++I1;
+    }
+  }
+
+  if (Changed)
+    MRI.clearKillFlags(Reg);
+
+  return Changed;
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+
   return true;
 }
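A worked illustration of the interference check in hoistAndMergeSGPRInits above (a hypothetical straight-line fragment, modeled on bb.0 of the new MIR test below): identical inits merge only if no init of a different value can execute between them.

  SI_INIT_M0 -1, implicit-def %m0     ; kept
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec  ; reads m0, no clobber
  SI_INIT_M0 -1, implicit-def %m0     ; erased: merged into the -1 init above
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
  SI_INIT_M0 65536, implicit-def %m0  ; kept: different value, clobbers m0
  DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
  SI_INIT_M0 -1, implicit-def %m0     ; kept: the 65536 init interferes, so this
                                      ; cannot merge with the first -1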

lib/Target/AMDGPU/SIRegisterInfo.cpp

@@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
+  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
+  reserveRegisterTuples(Reserved, AMDGPU::M0);
+
   // Reserve the memory aperture registers.
   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);

test/CodeGen/AMDGPU/merge-m0.mir (new file)

@@ -0,0 +1,132 @@
# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s
# GCN: bb.0.entry:
# GCN: SI_INIT_M0 -1
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 65536
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 -1
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 65536
# GCN-NEXT: DS_WRITE_B32
# GCN: bb.1:
# GCN: SI_INIT_M0 -1
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: DS_WRITE_B32
# GCN: bb.2:
# GCN: SI_INIT_M0 65536
# GCN-NEXT: DS_WRITE_B32
# GCN: bb.3:
# GCN: SI_INIT_M0 3
# GCN: bb.4:
# GCN-NOT: SI_INIT_M0
# GCN: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 4
# GCN-NEXT: DS_WRITE_B32
# GCN: bb.5:
# GCN-NOT: SI_INIT_M0
# GCN: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 4
# GCN-NEXT: DS_WRITE_B32
# GCN: bb.6:
# GCN: SI_INIT_M0 -1,
# GCN-NEXT: DS_WRITE_B32
# GCN: SI_INIT_M0 %2
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 %2
# GCN-NEXT: DS_WRITE_B32
# GCN-NEXT: SI_INIT_M0 -1
# GCN-NEXT: DS_WRITE_B32
---
name: test
alignment: 0
exposesReturnsTwice: false
noVRegs: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: vgpr_32 }
  - { id: 1, class: vgpr_32 }
  - { id: 2, class: sreg_32_xm0 }
body: |
  bb.0.entry:
    successors: %bb.1, %bb.2

    %0 = IMPLICIT_DEF
    %1 = IMPLICIT_DEF
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 65536, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 65536, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 65536, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
    S_BRANCH %bb.2

  bb.1:
    successors: %bb.2

    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_BRANCH %bb.2

  bb.2:
    successors: %bb.3

    SI_INIT_M0 65536, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_BRANCH %bb.3

  bb.3:
    successors: %bb.4, %bb.5

    S_CBRANCH_VCCZ %bb.4, implicit undef %vcc
    S_BRANCH %bb.5

  bb.4:
    successors: %bb.6

    SI_INIT_M0 3, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 4, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_BRANCH %bb.6

  bb.5:
    successors: %bb.6

    SI_INIT_M0 3, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 4, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_BRANCH %bb.6

  bb.6:
    successors: %bb.0.entry, %bb.6

    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    %2 = IMPLICIT_DEF
    SI_INIT_M0 %2, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 %2, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
    S_CBRANCH_VCCZ %bb.6, implicit undef %vcc
    S_BRANCH %bb.0.entry
...
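Per the GCN CHECK lines above, the two SI_INIT_M0 3 defs from bb.4 and bb.5 merge and hoist into bb.3, their nearest common dominator, while the SI_INIT_M0 4 defs stay put: placing a 4 in bb.3 would clobber m0 before the DS_WRITE_B32s that still need the value 3. Roughly, the expected output looks like this (an illustrative sketch, not verbatim pass output):

  bb.3:
    SI_INIT_M0 3, implicit-def %m0   ; merged from bb.4 and bb.5, hoisted here
    S_CBRANCH_VCCZ %bb.4, implicit undef %vcc
    S_BRANCH %bb.5
  bb.4:
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec  ; m0 == 3
    SI_INIT_M0 4, implicit-def %m0   ; not hoisted: would interfere with the use above
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec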

test/CodeGen/AMDGPU/spill-m0.ll

@@ -69,19 +69,20 @@ endif:
 ; TOSMEM-NOT: s_m0
 ; TOSMEM: s_add_u32 m0, s7, 0x100
 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s7, 0x200
 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b64 exec,
 ; TOSMEM: s_cbranch_execz
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x200
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
@@ -130,7 +131,7 @@ endif: ; preds = %else, %if
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
@@ -159,13 +160,14 @@ endif:
 ; GCN-LABEL: {{^}}restore_m0_lds:
 ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_cmp_eq_u32
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
 ; TOSMEM: s_mov_b32 m0, -1
@@ -178,10 +180,10 @@ endif:
 ; TOSMEM: ds_write_b64
 
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b32 m0, s0