AMDGPU: Fix debug info handling in post-RA bundler

This was allowing debug instructions to break the bundling, which
would change scheduling behavior. Bundle debug info / kills inside
the bundle. This seems to work OK, although the asm printer doesn't
understand these in a bundle. This implicitly expects the memory
legalizer to unbundle. It would probably be slightly nicer to move
these after.

Rewrite the loop to be clearer and make sure we don't end a bundle on
a meta instruction, only allow them in between other valid bundle
instructions.
This commit is contained in:
Matt Arsenault 2021-02-14 10:46:10 -05:00
parent 3c8bf29f14
commit c320e8196a
3 changed files with 223 additions and 37 deletions

View File

@ -48,10 +48,15 @@ private:
SmallSet<Register, 16> Defs;
bool isBundleCandidate(const MachineInstr &MI) const;
bool isDependentLoad(const MachineInstr &MI) const;
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
};
constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
SIInstrFlags::SMRD | SIInstrFlags::DS |
SIInstrFlags::FLAT | SIInstrFlags::MIMG;
} // End anonymous namespace.
INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
@ -80,55 +85,73 @@ bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
return false;
}
bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const {
const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled();
}
bool SIPostRABundler::canBundle(const MachineInstr &MI,
const MachineInstr &NextMI) const {
const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
return (IMemFlags != 0 && MI.mayLoadOrStore() && !NextMI.isBundled() &&
NextMI.mayLoad() == MI.mayLoad() && NextMI.mayStore() == MI.mayStore() &&
((NextMI.getDesc().TSFlags & MemFlags) == IMemFlags) &&
!isDependentLoad(NextMI));
}
bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
bool Changed = false;
const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
SIInstrFlags::SMRD | SIInstrFlags::DS |
SIInstrFlags::FLAT | SIInstrFlags::MIMG;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
MachineBasicBlock::instr_iterator B = MBB.instr_begin();
MachineBasicBlock::instr_iterator E = MBB.instr_end();
for (auto I = B; I != E; I = Next) {
Next = std::next(I);
const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags;
if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() ||
B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
((B->getDesc().TSFlags & MemFlags) != IMemFlags) ||
isDependentLoad(*I)) {
if (B != I) {
if (std::next(B) != I) {
finalizeBundle(MBB, B, I);
Changed = true;
}
Next = I;
}
B = Next;
Defs.clear();
if (!isBundleCandidate(*I))
continue;
assert(Defs.empty());
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
MachineBasicBlock::instr_iterator BundleStart = I;
MachineBasicBlock::instr_iterator BundleEnd = I;
unsigned ClauseLength = 1;
for (I = Next; I != E; I = Next) {
Next = std::next(I);
assert(BundleEnd != I);
if (canBundle(*BundleEnd, *I)) {
BundleEnd = I;
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
++ClauseLength;
} else if (!I->isMetaInstruction()) {
// Allow meta instructions in between bundle candidates, but do not
// start or end a bundle on one.
//
// TODO: It may be better to move meta instructions like dbg_value
// after the bundle. We're relying on the memory legalizer to unbundle
// these.
break;
}
}
if (I->getNumExplicitDefs() == 0)
continue;
Next = std::next(BundleEnd);
if (ClauseLength > 1) {
Changed = true;
finalizeBundle(MBB, BundleStart, Next);
}
Defs.insert(I->defs().begin()->getReg());
Defs.clear();
}
if (B != E && std::next(B) != E) {
finalizeBundle(MBB, B, E);
Changed = true;
}
Defs.clear();
}
return Changed;

View File

@ -0,0 +1,55 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+xnack -amdgpu-max-memory-clause=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Test the behavior of the post-RA soft clause bundler in the presence
; of debug info. The debug info should not interfere with the
; bundling, which could result in an observable codegen change.
define amdgpu_kernel void @dbg_clause(float addrspace(1)* %out, float addrspace(1)* %aptr) !dbg !4 {
; GCN-LABEL: dbg_clause:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dword v1, v0, s[2:3]
; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1
; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32
; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, v1, v2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %gep0, i32 8
%a = load float, float addrspace(1)* %gep0, align 4
call void @llvm.dbg.value(metadata float %a, metadata !8, metadata !DIExpression()), !dbg !9
%b = load float, float addrspace(1)* %gep1, align 4
call void @llvm.dbg.value(metadata float %b, metadata !10, metadata !DIExpression()), !dbg !11
%fadd = fadd float %a, %b
store float %fadd, float addrspace(1)* %out.gep, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.dbg.value(metadata, metadata, metadata) #1
attributes #0 = { nounwind readnone speculatable willreturn }
attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
!5 = !DISubroutineType(types: !6)
!6 = !{null, !7}
!7 = !DIBasicType(name: "float", size: 32, align: 32)
!8 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1)
!9 = !DILocation(line: 1, column: 42, scope: !4)
!10 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 2)
!11 = !DILocation(line: 2, column: 42, scope: !4)

View File

@ -56,12 +56,12 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: }
; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
; GCN: }
; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
; GCN: }
; GCN: S_NOP 0
; GCN: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
@ -112,3 +112,111 @@ body: |
$vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
$vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
...
# Middle dbg_value should be bundled
---
name: bundle_dbg_value_0
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-LABEL: name: bundle_dbg_value_0
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
; GCN: DBG_VALUE internal $vgpr0, 0, 0
; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
; GCN: }
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
DBG_VALUE $vgpr0, 0, 0
$vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
...
# Middle dbg_value should be bundled
---
name: bundle_dbg_value_1
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-LABEL: name: bundle_dbg_value_1
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
; GCN: DBG_VALUE internal $vgpr0, 0, 0
; GCN: DBG_VALUE $vgpr1, 0, 0
; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
; GCN: }
; GCN: DBG_VALUE $vgpr2, 0, 0
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
DBG_VALUE $vgpr0, 0, 0
DBG_VALUE $vgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
DBG_VALUE $vgpr2, 0, 0
...
# Starting and ending dbg_values should not be in the bundle
---
name: bundle_dbg_value_2
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-LABEL: name: bundle_dbg_value_2
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN: DBG_VALUE $vgpr1, 0, 0
; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
; GCN: DBG_VALUE internal $vgpr0, 0, 0
; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
; GCN: }
; GCN: DBG_VALUE $vgpr2, 0, 0
DBG_VALUE $vgpr1, 0, 0
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
DBG_VALUE $vgpr0, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
DBG_VALUE $vgpr2, 0, 0
...
---
name: bundle_kill
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-LABEL: name: bundle_kill
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
; GCN: KILL $vgpr1
; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
; GCN: }
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
KILL $vgpr1
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
...
---
name: bundle_kill_def_in_bundle
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-LABEL: name: bundle_kill_def_in_bundle
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
; GCN: KILL internal $vgpr0
; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
; GCN: }
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
KILL $vgpr0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
...