[AMDGPU] Ignore KILLs when forming clauses

KILL instructions are sometimes present and prevent hard
clauses from being formed.

Fix this by ignoring all meta instructions in clauses.

Differential Revision: https://reviews.llvm.org/D106042
This commit is contained in:
Sebastian Neubauer 2021-07-16 13:15:49 +02:00 committed by Sebastian Neubauer
parent 63bb2d585e
commit bf980930e5
4 changed files with 94 additions and 16 deletions

View File

@ -268,6 +268,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return; return;
} }
if (MI->isMetaInstruction()) {
if (isVerbose())
OutStreamer->emitRawComment(" meta instruction");
return;
}
MCInst TmpInst; MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst); MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst); EmitToStreamer(*OutStreamer, TmpInst);

View File

@ -58,6 +58,8 @@ enum HardClauseType {
// Internal instructions, which are allowed in the middle of a hard clause, // Internal instructions, which are allowed in the middle of a hard clause,
// except for s_waitcnt. // except for s_waitcnt.
HARDCLAUSE_INTERNAL, HARDCLAUSE_INTERNAL,
// Meta instructions, such as KILL, that do not result in any ISA.
HARDCLAUSE_IGNORE,
// Instructions that are not allowed in a hard clause: SALU, export, branch, // Instructions that are not allowed in a hard clause: SALU, export, branch,
// message, GDS, s_waitcnt and anything else not mentioned above. // message, GDS, s_waitcnt and anything else not mentioned above.
HARDCLAUSE_ILLEGAL, HARDCLAUSE_ILLEGAL,
@ -100,6 +102,8 @@ public:
// It's safe to treat the rest as illegal. // It's safe to treat the rest as illegal.
if (MI.getOpcode() == AMDGPU::S_NOP) if (MI.getOpcode() == AMDGPU::S_NOP)
return HARDCLAUSE_INTERNAL; return HARDCLAUSE_INTERNAL;
if (MI.isMetaInstruction())
return HARDCLAUSE_IGNORE;
return HARDCLAUSE_ILLEGAL; return HARDCLAUSE_ILLEGAL;
} }
@ -112,25 +116,25 @@ public:
// The last non-internal instruction in the clause. // The last non-internal instruction in the clause.
MachineInstr *Last = nullptr; MachineInstr *Last = nullptr;
// The length of the clause including any internal instructions in the // The length of the clause including any internal instructions in the
// middle or after the end of the clause. // middle (but not at the end) of the clause.
unsigned Length = 0; unsigned Length = 0;
// Internal instructions at the end of a clause should not be included in
// the clause. Count them in TrailingInternalLength until a new memory
// instruction is added.
unsigned TrailingInternalLength = 0;
// The base operands of *Last. // The base operands of *Last.
SmallVector<const MachineOperand *, 4> BaseOps; SmallVector<const MachineOperand *, 4> BaseOps;
}; };
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
// Get the size of the clause excluding any internal instructions at the if (CI.First == CI.Last)
// end.
unsigned Size =
std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1;
if (Size < 2)
return false; return false;
assert(Size <= 64 && "Hard clause is too long!"); assert(CI.Length <= 64 && "Hard clause is too long!");
auto &MBB = *CI.First->getParent(); auto &MBB = *CI.First->getParent();
auto ClauseMI = auto ClauseMI =
BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE)) BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
.addImm(Size - 1); .addImm(CI.Length - 1);
finalizeBundle(MBB, ClauseMI->getIterator(), finalizeBundle(MBB, ClauseMI->getIterator(),
std::next(CI.Last->getIterator())); std::next(CI.Last->getIterator()));
return true; return true;
@ -168,6 +172,7 @@ public:
if (CI.Length == 64 || if (CI.Length == 64 ||
(CI.Length && Type != HARDCLAUSE_INTERNAL && (CI.Length && Type != HARDCLAUSE_INTERNAL &&
Type != HARDCLAUSE_IGNORE &&
(Type != CI.Type || (Type != CI.Type ||
// Note that we lie to shouldClusterMemOps about the size of the // Note that we lie to shouldClusterMemOps about the size of the
// cluster. When shouldClusterMemOps is called from the machine // cluster. When shouldClusterMemOps is called from the machine
@ -182,14 +187,20 @@ public:
if (CI.Length) { if (CI.Length) {
// Extend the current clause. // Extend the current clause.
++CI.Length; if (Type != HARDCLAUSE_IGNORE) {
if (Type != HARDCLAUSE_INTERNAL) { if (Type == HARDCLAUSE_INTERNAL) {
CI.Last = &MI; ++CI.TrailingInternalLength;
CI.BaseOps = std::move(BaseOps); } else {
++CI.Length;
CI.Length += CI.TrailingInternalLength;
CI.TrailingInternalLength = 0;
CI.Last = &MI;
CI.BaseOps = std::move(BaseOps);
}
} }
} else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
// Start a new clause. // Start a new clause.
CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)}; CI = ClauseInfo{Type, &MI, &MI, 1, 0, std::move(BaseOps)};
} }
} }

View File

@ -8624,10 +8624,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x2
; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX10-SCRATCH-NEXT: ; meta instruction
; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX10-SCRATCH-NEXT: ; meta instruction
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]

View File

@ -34,6 +34,27 @@ body: |
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
... ...
---
name: nop3
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
; CHECK-LABEL: name: nop3
; CHECK: liveins: $sgpr0_sgpr1
; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
; CHECK: S_CLAUSE 2
; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK: S_NOP 2
; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK: }
; CHECK: S_NOP 2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
S_NOP 2
...
--- ---
name: long_clause name: long_clause
tracksRegLiveness: true tracksRegLiveness: true
@ -239,3 +260,43 @@ body: |
$vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
... ...
---
name: kill
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr4
; CHECK-LABEL: name: kill
; CHECK: liveins: $sgpr0_sgpr1, $sgpr4
; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; CHECK: S_CLAUSE 1
; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK: KILL undef renamable $sgpr4
; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK: }
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
...
---
name: kill2
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; CHECK-LABEL: name: kill2
; CHECK: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; CHECK: S_CLAUSE 1
; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK: KILL undef renamable $sgpr4
; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK: }
; CHECK: KILL undef renamable $sgpr5
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
KILL undef renamable $sgpr5
...