forked from OSchip/llvm-project
[AMDGPU] Merge M0 initializations
Merges equivalent initializations of M0 and hoists them into a common dominator block. Technically the same code can be used with any register, physical or virtual. Differential Revision: https://reviews.llvm.org/D32279 llvm-svn: 301228
This commit is contained in:
parent
610c966a4e
commit
bd5394be3d
|
@ -81,6 +81,11 @@ using namespace llvm;
|
|||
|
||||
#define DEBUG_TYPE "si-fix-sgpr-copies"
|
||||
|
||||
static cl::opt<bool> EnableM0Merge(
|
||||
"amdgpu-enable-merge-m0",
|
||||
cl::desc("Merge and hoist M0 initializations"),
|
||||
cl::init(false));
|
||||
|
||||
namespace {
|
||||
|
||||
class SIFixSGPRCopies : public MachineFunctionPass {
|
||||
|
@ -108,7 +113,7 @@ public:
|
|||
|
||||
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
|
||||
"SI Fix SGPR copies", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
|
||||
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
|
||||
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
|
||||
"SI Fix SGPR copies", false, false)
|
||||
|
||||
|
@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
|
||||
const TargetRegisterInfo *TRI) {
|
||||
DenseSet<MachineBasicBlock*> Visited;
|
||||
template <class UnaryPredicate>
|
||||
bool searchPredecessors(const MachineBasicBlock *MBB,
|
||||
const MachineBasicBlock *CutOff,
|
||||
UnaryPredicate Predicate) {
|
||||
|
||||
if (MBB == CutOff)
|
||||
return false;
|
||||
|
||||
DenseSet<const MachineBasicBlock*> Visited;
|
||||
SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
|
||||
MBB->pred_end());
|
||||
|
||||
while (!Worklist.empty()) {
|
||||
MachineBasicBlock *mbb = Worklist.back();
|
||||
Worklist.pop_back();
|
||||
MachineBasicBlock *MBB = Worklist.pop_back_val();
|
||||
|
||||
if (!Visited.insert(mbb).second)
|
||||
if (!Visited.insert(MBB).second)
|
||||
continue;
|
||||
if (hasTerminatorThatModifiesExec(*mbb, *TRI))
|
||||
if (MBB == CutOff)
|
||||
continue;
|
||||
if (Predicate(MBB))
|
||||
return true;
|
||||
|
||||
Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
|
||||
Worklist.append(MBB->pred_begin(), MBB->pred_end());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
|
||||
const TargetRegisterInfo *TRI) {
|
||||
return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
|
||||
return hasTerminatorThatModifiesExec(*MBB, *TRI); });
|
||||
}
|
||||
|
||||
// Checks if there is a potential path from instruction From to instruction To.
|
||||
// If CutOff is specified and it sits in the middle of that path, we ignore
|
||||
// a higher portion of the path and report it is not reachable.
|
||||
static bool isReachable(const MachineInstr *From,
|
||||
const MachineInstr *To,
|
||||
const MachineBasicBlock *CutOff,
|
||||
MachineDominatorTree &MDT) {
|
||||
// If either From block dominates To block or instructions are in the same
|
||||
// block and From is higher.
|
||||
if (MDT.dominates(From, To))
|
||||
return true;
|
||||
|
||||
const MachineBasicBlock *MBBFrom = From->getParent();
|
||||
const MachineBasicBlock *MBBTo = To->getParent();
|
||||
if (MBBFrom == MBBTo)
|
||||
return false;
|
||||
|
||||
// Instructions are in different blocks, do predecessor search.
|
||||
// We should almost never get here since we do not usually produce M0 stores
|
||||
// other than -1.
|
||||
return searchPredecessors(MBBTo, CutOff, [MBBFrom]
|
||||
(const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
|
||||
}
|
||||
|
||||
// Hoist and merge identical SGPR initializations into a common predecessor.
|
||||
// This is intended to combine M0 initializations, but can work with any
|
||||
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
|
||||
// execution.
|
||||
static bool hoistAndMergeSGPRInits(unsigned Reg,
|
||||
const MachineRegisterInfo &MRI,
|
||||
MachineDominatorTree &MDT) {
|
||||
// List of inits by immediate value.
|
||||
typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
|
||||
InitListMap Inits;
|
||||
// List of clobbering instructions.
|
||||
SmallVector<MachineInstr*, 8> Clobbers;
|
||||
bool Changed = false;
|
||||
|
||||
for (auto &MI : MRI.def_instructions(Reg)) {
|
||||
MachineOperand *Imm = nullptr;
|
||||
for (auto &MO: MI.operands()) {
|
||||
if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
|
||||
(!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
|
||||
Imm = nullptr;
|
||||
break;
|
||||
} else if (MO.isImm())
|
||||
Imm = &MO;
|
||||
}
|
||||
if (Imm)
|
||||
Inits[Imm->getImm()].push_front(&MI);
|
||||
else
|
||||
Clobbers.push_back(&MI);
|
||||
}
|
||||
|
||||
for (auto &Init : Inits) {
|
||||
auto &Defs = Init.second;
|
||||
|
||||
for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
|
||||
MachineInstr *MI1 = *I1;
|
||||
|
||||
for (auto I2 = std::next(I1); I2 != E; ) {
|
||||
MachineInstr *MI2 = *I2;
|
||||
|
||||
// Check any possible interference
|
||||
auto intereferes = [&](MachineBasicBlock::iterator From,
|
||||
MachineBasicBlock::iterator To) -> bool {
|
||||
|
||||
assert(MDT.dominates(&*To, &*From));
|
||||
|
||||
auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
|
||||
const MachineBasicBlock *MBBFrom = From->getParent();
|
||||
const MachineBasicBlock *MBBTo = To->getParent();
|
||||
bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
|
||||
bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
|
||||
if (!MayClobberFrom && !MayClobberTo)
|
||||
return false;
|
||||
if ((MayClobberFrom && !MayClobberTo) ||
|
||||
(!MayClobberFrom && MayClobberTo))
|
||||
return true;
|
||||
// Both can clobber, this is not an interference only if both are
|
||||
// dominated by Clobber and belong to the same block or if Clobber
|
||||
// properly dominates To, given that To >> From, so it dominates
|
||||
// both and is located in a common dominator.
|
||||
return !((MBBFrom == MBBTo &&
|
||||
MDT.dominates(Clobber, &*From) &&
|
||||
MDT.dominates(Clobber, &*To)) ||
|
||||
MDT.properlyDominates(Clobber->getParent(), MBBTo));
|
||||
};
|
||||
|
||||
return (any_of(Clobbers, interferes)) ||
|
||||
(any_of(Inits, [&](InitListMap::value_type &C) {
|
||||
return C.first != Init.first && any_of(C.second, interferes);
|
||||
}));
|
||||
};
|
||||
|
||||
if (MDT.dominates(MI1, MI2)) {
|
||||
if (!intereferes(MI2, MI1)) {
|
||||
DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
|
||||
<< " " << *MI2);
|
||||
MI2->eraseFromParent();
|
||||
Defs.erase(I2++);
|
||||
Changed = true;
|
||||
continue;
|
||||
}
|
||||
} else if (MDT.dominates(MI2, MI1)) {
|
||||
if (!intereferes(MI1, MI2)) {
|
||||
DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
|
||||
<< " " << *MI1);
|
||||
MI1->eraseFromParent();
|
||||
Defs.erase(I1++);
|
||||
Changed = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
|
||||
MI2->getParent());
|
||||
if (!MBB) {
|
||||
++I2;
|
||||
continue;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
|
||||
if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
|
||||
DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
|
||||
<< " " << *MI1 << "and moving from BB#"
|
||||
<< MI2->getParent()->getNumber() << " to BB#"
|
||||
<< I->getParent()->getNumber() << " " << *MI2);
|
||||
I->getParent()->splice(I, MI2->getParent(), MI2);
|
||||
MI1->eraseFromParent();
|
||||
Defs.erase(I1++);
|
||||
Changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
++I2;
|
||||
}
|
||||
++I1;
|
||||
}
|
||||
}
|
||||
|
||||
if (Changed)
|
||||
MRI.clearKillFlags(Reg);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
|
@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
|||
}
|
||||
}
|
||||
|
||||
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
|
||||
hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
|||
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
|
||||
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
|
||||
|
||||
// M0 has to be reserved so that LLVM accepts it as a live-in into a block.
|
||||
reserveRegisterTuples(Reserved, AMDGPU::M0);
|
||||
|
||||
// Reserve the memory aperture registers.
|
||||
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
|
||||
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s
|
||||
|
||||
# GCN: bb.0.entry:
|
||||
# GCN: SI_INIT_M0 -1
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 65536
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 -1
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 65536
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
# GCN: bb.1:
|
||||
# GCN: SI_INIT_M0 -1
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
# GCN: bb.2:
|
||||
# GCN: SI_INIT_M0 65536
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
# GCN: bb.3:
|
||||
# GCN: SI_INIT_M0 3
|
||||
|
||||
# GCN: bb.4:
|
||||
# GCN-NOT: SI_INIT_M0
|
||||
# GCN: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 4
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
# GCN: bb.5:
|
||||
# GCN-NOT: SI_INIT_M0
|
||||
# GCN: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 4
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
# GCN: bb.6:
|
||||
# GCN: SI_INIT_M0 -1,
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN: SI_INIT_M0 %2
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 %2
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
# GCN-NEXT: SI_INIT_M0 -1
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
|
||||
---
|
||||
name: test
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
noVRegs: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vgpr_32 }
|
||||
- { id: 1, class: vgpr_32 }
|
||||
- { id: 2, class: sreg_32_xm0 }
|
||||
body: |
|
||||
bb.0.entry:
|
||||
successors: %bb.1, %bb.2
|
||||
|
||||
%0 = IMPLICIT_DEF
|
||||
%1 = IMPLICIT_DEF
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 65536, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 65536, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 65536, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
successors: %bb.3
|
||||
SI_INIT_M0 65536, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_BRANCH %bb.3
|
||||
|
||||
bb.3:
|
||||
successors: %bb.4, %bb.5
|
||||
S_CBRANCH_VCCZ %bb.4, implicit undef %vcc
|
||||
S_BRANCH %bb.5
|
||||
|
||||
bb.4:
|
||||
successors: %bb.6
|
||||
SI_INIT_M0 3, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 4, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_BRANCH %bb.6
|
||||
|
||||
bb.5:
|
||||
successors: %bb.6
|
||||
SI_INIT_M0 3, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 4, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_BRANCH %bb.6
|
||||
|
||||
bb.6:
|
||||
successors: %bb.0.entry, %bb.6
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
%2 = IMPLICIT_DEF
|
||||
SI_INIT_M0 %2, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 %2, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
SI_INIT_M0 -1, implicit-def %m0
|
||||
DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
|
||||
S_CBRANCH_VCCZ %bb.6, implicit undef %vcc
|
||||
S_BRANCH %bb.0.entry
|
||||
|
||||
...
|
|
@ -69,19 +69,20 @@ endif:
|
|||
; TOSMEM-NOT: s_m0
|
||||
; TOSMEM: s_add_u32 m0, s7, 0x100
|
||||
; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_add_u32 m0, s7, 0x200
|
||||
; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
|
||||
; TOSMEM: s_mov_b64 exec,
|
||||
; TOSMEM: s_cbranch_execz
|
||||
; TOSMEM: s_branch
|
||||
|
||||
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
|
||||
; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
|
||||
; TOSMEM: s_add_u32 m0, s7, 0x200
|
||||
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
|
||||
|
||||
|
||||
|
@ -130,7 +131,7 @@ endif: ; preds = %else, %if
|
|||
; TOSMEM: s_branch
|
||||
|
||||
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
|
||||
; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
|
||||
; TOSMEM: s_add_u32 m0, s3, 0x100
|
||||
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
|
||||
|
||||
; GCN-NOT: v_readlane_b32 m0
|
||||
|
@ -159,13 +160,14 @@ endif:
|
|||
; GCN-LABEL: {{^}}restore_m0_lds:
|
||||
; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
|
||||
; TOSMEM: s_cmp_eq_u32
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_add_u32 m0, s3, 0x100
|
||||
; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_add_u32 m0, s3, 0x300
|
||||
; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_cbranch_scc1
|
||||
|
||||
; TOSMEM: s_mov_b32 m0, -1
|
||||
|
@ -178,10 +180,10 @@ endif:
|
|||
|
||||
; TOSMEM: ds_write_b64
|
||||
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_add_u32 m0, s3, 0x300
|
||||
; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
|
||||
; TOSMEM-NOT: m0
|
||||
; FIXME-TOSMEM-NOT: m0
|
||||
; TOSMEM: s_waitcnt lgkmcnt(0)
|
||||
; TOSMEM-NOT: m0
|
||||
; TOSMEM: s_mov_b32 m0, s0
|
||||
|
|
Loading…
Reference in New Issue