diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 204e39251783..cead53a913d0 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1600,6 +1600,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + bool Modified = false; + + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to do the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + if (ST->hasVscnt()) + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + + Modified = true; + } // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. @@ -1607,7 +1629,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockInfos.insert({MBB, BlockInfo(MBB)}); std::unique_ptr Brackets; - bool Modified = false; bool Repeat; do { Repeat = false; @@ -1707,26 +1728,5 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } - if (!MFI->isEntryFunction()) { - // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to the wait after the - // costly call sequence. - - // TODO: Could insert earlier and schedule more liberally with operations - // that only use caller preserved registers. - MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); - for (MachineBasicBlock::iterator E = EntryBB.end(); - I != E && (I->isPHI() || I->isMetaInstruction()); ++I) - ; - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - if (ST->hasVscnt()) - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - - Modified = true; - } - return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir index a79187a4e387..0665f9647ac0 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir @@ -192,4 +192,17 @@ body: | $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr S_WAITCNT 3952 KILL $vgpr0 + +# Combine preexisting waitcnt with wait added to the start of a non-entry function. + +--- +name: test_waitcnt_preexisting_func_start +body: | + bb.0: + ; GFX9-LABEL: name: test_waitcnt_preexisting_func_start + ; GFX9: S_WAITCNT 0 + ; GFX9-NOT: S_WAITCNT 0 + ; GFX9: S_ENDPGM 0 + S_WAITCNT 0 + S_ENDPGM 0 ...