AMDGPU: Don't wait at end of block with a trivial successor

If there is only one successor, and that successor only
has one predecessor the wait can obviously be delayed until
uses or the end of the next block. This avoids code quality
regressions when there are trivial fallthrough blocks inserted
for structurization.

llvm-svn: 297251
This commit is contained in:
Matt Arsenault 2017-03-08 01:06:58 +00:00
parent f561abab56
commit 52d1b62a28
2 changed files with 85 additions and 4 deletions

View File

@ -524,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
}
}
/// Return true if \p MBB has one successor immediately following, and is its
/// only predecessor
static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
if (MBB.succ_size() != 1)
return false;
const MachineBasicBlock *Succ = *MBB.succ_begin();
return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
}
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@ -642,8 +652,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
EndPgmBlocks.push_back(&MBB);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
// Wait for everything at the end of the MBB. If there is only one
// successor, we can defer this until the uses there.
if (!hasTrivialSuccessor(MBB))
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
if (HaveScalarStores) {

View File

@ -7,6 +7,15 @@
<4 x i32> addrspace(4)* %flat16) {
ret void
}
define void @single_fallthrough_successor_no_end_block_wait() {
ret void
}
define void @single_branch_successor_not_next_block() {
ret void
}
...
---
@ -21,18 +30,21 @@
# CHECK-LABEL: bb.1:
# CHECK: FLAT_LOAD_DWORD
# CHECK: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
# The first load has no mem operand, so we should assume it accesses the flat
# address space.
# s_waitcnt vmcnt(0) lgkmcnt(0)
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: S_WAITCNT 127
# CHECK-LABEL: bb.2:
# CHECK: FLAT_LOAD_DWORD
# CHECK: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
# One outstand loads access the flat address space.
# s_waitcnt vmcnt(0) lgkmcnt(0)
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: S_WAITCNT 127
name: flat_zero_waitcnt
@ -57,3 +69,60 @@ body: |
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_ENDPGM
...
---
# There is only a single fallthrough successor block, so there's no
# need to wait immediately.
# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
# CHECK-NOT: S_WAITCNT
# CHECK: bb.1:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_fallthrough_successor_no_end_block_wait
body: |
bb.0:
successors: %bb.1
%vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
bb.1:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
---
# The block has a single predecessor with a single successor, but it
# is not the next block so it's non-obvious that the wait is not needed.
# CHECK-LABEL: name: single_branch_successor_not_next_block
# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
# CHECK-NEXT: S_WAITCNT 112
# CHECK: bb.1
# CHECK-NEXT: FLAT_STORE_DWORD
# CHECK-NEXT: S_ENDPGM
# CHECK: bb.2:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_branch_successor_not_next_block
body: |
bb.0:
successors: %bb.2
%vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
S_BRANCH %bb.2
bb.1:
FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
bb.2:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...