diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 3b932ced7520..ccbcc867215c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -268,6 +268,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->isMetaInstruction()) { + if (isVerbose()) + OutStreamer->emitRawComment(" meta instruction"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index 7ba20eb6027b..125f006a1d1d 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -58,6 +58,8 @@ enum HardClauseType { // Internal instructions, which are allowed in the middle of a hard clause, // except for s_waitcnt. HARDCLAUSE_INTERNAL, + // Meta instructions that do not result in any ISA like KILL. + HARDCLAUSE_IGNORE, // Instructions that are not allowed in a hard clause: SALU, export, branch, // message, GDS, s_waitcnt and anything else not mentioned above. HARDCLAUSE_ILLEGAL, @@ -100,6 +102,8 @@ public: // It's safe to treat the rest as illegal. if (MI.getOpcode() == AMDGPU::S_NOP) return HARDCLAUSE_INTERNAL; + if (MI.isMetaInstruction()) + return HARDCLAUSE_IGNORE; return HARDCLAUSE_ILLEGAL; } @@ -112,25 +116,25 @@ public: // The last non-internal instruction in the clause. MachineInstr *Last = nullptr; // The length of the clause including any internal instructions in the - // middle or after the end of the clause. + // middle (but not at the end) of the clause. unsigned Length = 0; + // Internal instructions at the end of a clause should not be included in + // the clause. Count them in TrailingInternalLength until a new memory + // instruction is added. + unsigned TrailingInternalLength = 0; // The base operands of *Last. 
SmallVector<const MachineOperand *, 4> BaseOps; }; bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { - // Get the size of the clause excluding any internal instructions at the - // end. - unsigned Size = - std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1; - if (Size < 2) + if (CI.First == CI.Last) return false; - assert(Size <= 64 && "Hard clause is too long!"); + assert(CI.Length <= 64 && "Hard clause is too long!"); auto &MBB = *CI.First->getParent(); auto ClauseMI = BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE)) - .addImm(Size - 1); + .addImm(CI.Length - 1); finalizeBundle(MBB, ClauseMI->getIterator(), std::next(CI.Last->getIterator())); return true; @@ -168,6 +172,7 @@ public: if (CI.Length == 64 || (CI.Length && Type != HARDCLAUSE_INTERNAL && + Type != HARDCLAUSE_IGNORE && (Type != CI.Type || // Note that we lie to shouldClusterMemOps about the size of the // cluster. When shouldClusterMemOps is called from the machine @@ -182,14 +187,20 @@ public: if (CI.Length) { // Extend the current clause. - ++CI.Length; - if (Type != HARDCLAUSE_INTERNAL) { - CI.Last = &MI; - CI.BaseOps = std::move(BaseOps); + if (Type != HARDCLAUSE_IGNORE) { + if (Type == HARDCLAUSE_INTERNAL) { + ++CI.TrailingInternalLength; + } else { + ++CI.Length; + CI.Length += CI.TrailingInternalLength; + CI.TrailingInternalLength = 0; + CI.Last = &MI; + CI.BaseOps = std::move(BaseOps); + } } } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { // Start a new clause. 
- CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)}; + CI = ClauseInfo{Type, &MI, &MI, 1, 0, std::move(BaseOps)}; } } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 6e89bfe3ae02..16022e33b84c 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -8624,10 +8624,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x2 ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 -; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: ; meta instruction ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir index 9ff28639f434..78e8ab8fe6c8 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir @@ -34,6 +34,27 @@ body: | $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 ... 
+--- +name: nop3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: nop3 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 { + ; CHECK: S_CLAUSE 2 + ; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; CHECK: S_NOP 2 + ; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + ; CHECK: } + ; CHECK: S_NOP 2 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_NOP 2 + $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + S_NOP 2 +... + --- name: long_clause tracksRegLiveness: true @@ -239,3 +260,43 @@ body: | $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ... + +--- +name: kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr4 + ; CHECK-LABEL: name: kill + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr4 + ; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 { + ; CHECK: S_CLAUSE 1 + ; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; CHECK: KILL undef renamable $sgpr4 + ; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + ; CHECK: } + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + KILL undef renamable $sgpr4 + $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 +... 
+ +--- +name: kill2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5 + ; CHECK-LABEL: name: kill2 + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5 + ; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 { + ; CHECK: S_CLAUSE 1 + ; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; CHECK: KILL undef renamable $sgpr4 + ; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + ; CHECK: } + ; CHECK: KILL undef renamable $sgpr5 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + KILL undef renamable $sgpr4 + $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + KILL undef renamable $sgpr5 +...