diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index d4371ef4a6ca..052db5f6ea71 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -239,7 +239,16 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
     for (auto LiveIn : MBB.liveins())
       SplitBB->addLiveIn(LiveIn);
     MBB.addSuccessor(SplitBB);
-    MDT->addNewBlock(SplitBB, &MBB);
+
+    // Update dominator tree
+    using DomTreeT = DomTreeBase<MachineBasicBlock>;
+    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+    for (MachineBasicBlock *Succ : SplitBB->successors()) {
+      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+      DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+    }
+    DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+    MDT->getBase().applyUpdates(DTUpdates);
   }
 
   MBB.addSuccessor(EarlyExitBlock);
@@ -447,6 +456,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
 
+      case AMDGPU::SI_KILL_CLEANUP:
+        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          KillInstrs.push_back(&MI);
+        } else {
+          MI.eraseFromParent();
+        }
+        break;
+
       default:
         break;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2b053f8dc95e..ec378379ca92 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -379,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 1e90e6ba5418..36d52ac3ee89 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -89,8 +89,10 @@ private:
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<const MachineInstr*> LoweredIf;
+  SmallSet<MachineInstr *, 16> NeedsKillCleanup;
 
   const TargetRegisterClass *BoolRC = nullptr;
+  bool InsertKillCleanups;
   unsigned AndOpc;
   unsigned OrOpc;
   unsigned XorOpc;
@@ -111,6 +113,8 @@ private:
 
   void combineMasks(MachineInstr &MI);
 
+  void process(MachineInstr &MI);
+
   // Skip to the next instruction, ignoring debug instructions, and trivial
   // block boundaries (blocks that have one (typically fallthrough) successor,
   // and the successor has one predecessor.
@@ -160,8 +164,28 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
-                       const SIInstrInfo *TII) {
+static bool hasKill(const MachineBasicBlock *Begin,
+                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
+  DenseSet<const MachineBasicBlock*> Visited;
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+                                               Begin->succ_end());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+
+    if (MBB == End || !Visited.insert(MBB).second)
+      continue;
+    for (auto &Term : MBB->terminators())
+      if (TII->isKillTerminator(Term.getOpcode()))
+        return true;
+
+    Worklist.append(MBB->succ_begin(), MBB->succ_end());
+  }
+
+  return false;
+}
+
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
   Register SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -170,26 +194,6 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-  // if there is any such terminator simplififcations are not safe.
-  auto SMBB = MI.getParent();
-  auto EMBB = U->getParent();
-  DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(),
-                                               SMBB->succ_end());
-
-  while (!Worklist.empty()) {
-    MachineBasicBlock *MBB = Worklist.pop_back_val();
-
-    if (MBB == EMBB || !Visited.insert(MBB).second)
-      continue;
-    for(auto &Term : MBB->terminators())
-      if (TII->isKillTerminator(Term.getOpcode()))
-        return false;
-
-    Worklist.append(MBB->succ_begin(), MBB->succ_end());
-  }
-
   return true;
 }
 
@@ -207,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI, TII);
+  bool SimpleIf = isSimpleIf(MI, MRI);
+
+  if (InsertKillCleanups) {
+    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+    // flag the associated SI_END_CF for insertion of a kill cleanup.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+      MachineOperand &NextExec = UseMI->getOperand(0);
+      Register NextExecReg = NextExec.getReg();
+      if (NextExec.isDead()) {
+        assert(!SimpleIf);
+        break;
+      }
+      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+    }
+    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+        NeedsKillCleanup.insert(&*UseMI);
+        SimpleIf = false;
+      }
+    }
+  } else if (SimpleIf) {
+    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // if there is any such terminator simplifications are not safe.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+  }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
@@ -427,6 +459,8 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
 
     auto E = B->end();
     for ( ; It != E; ++It) {
+      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+        continue;
       if (TII->mayReadEXEC(*MRI, *It))
         break;
     }
@@ -461,8 +495,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
 
   LoweredEndCf.insert(NewMI);
 
-  if (LIS)
+  // If this ends control flow which contains kills (as flagged in emitIf)
+  // then insert an SI_KILL_CLEANUP immediately following the exec mask
+  // manipulation. This can be lowered to early termination if appropriate.
+  MachineInstr *CleanUpMI = nullptr;
+  if (NeedsKillCleanup.count(&MI))
+    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+  if (LIS) {
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    if (CleanUpMI)
+      LIS->InsertMachineInstrInMaps(*CleanUpMI);
+  }
 
   MI.eraseFromParent();
 
@@ -553,6 +597,56 @@ void SILowerControlFlow::optimizeEndCf() {
   }
 }
 
+void SILowerControlFlow::process(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::iterator I(MI);
+  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_IF:
+    emitIf(MI);
+    break;
+
+  case AMDGPU::SI_ELSE:
+    emitElse(MI);
+    break;
+
+  case AMDGPU::SI_IF_BREAK:
+    emitIfBreak(MI);
+    break;
+
+  case AMDGPU::SI_LOOP:
+    emitLoop(MI);
+    break;
+
+  case AMDGPU::SI_END_CF:
+    emitEndCf(MI);
+    break;
+
+  default:
+    assert(false && "Attempt to process unsupported instruction");
+    break;
+  }
+
+  MachineBasicBlock::iterator Next;
+  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MaskMI = *I;
+    switch (MaskMI.getOpcode()) {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+      // Cleanup bit manipulations on exec mask
+      combineMasks(MaskMI);
+      break;
+    default:
+      I = MBB.end();
+      break;
+    }
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -562,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
+  InsertKillCleanups =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   if (ST.isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
@@ -583,62 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     Exec = AMDGPU::EXEC;
   }
 
+  SmallVector<MachineInstr *, 32> Worklist;
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next, Last;
-
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
-        emitIf(MI);
+        process(MI);
        break;
 
      case AMDGPU::SI_ELSE:
-        emitElse(MI);
-        break;
-
      case AMDGPU::SI_IF_BREAK:
-        emitIfBreak(MI);
-        break;
-
      case AMDGPU::SI_LOOP:
-        emitLoop(MI);
-        break;
-
      case AMDGPU::SI_END_CF:
-        emitEndCf(MI);
+        // Only build worklist if SI_IF instructions must be processed first.
+        if (InsertKillCleanups)
+          Worklist.push_back(&MI);
+        else
+          process(MI);
         break;
 
-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
-        // Cleanup bit manipulations on exec mask
-        combineMasks(MI);
-        Last = I;
-        continue;
-
       default:
-        Last = I;
-        continue;
+        break;
       }
-
-      // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }
 
+  for (MachineInstr *MI : Worklist)
+    process(*MI);
+
   optimizeEndCf();
 
   LoweredEndCf.clear();
   LoweredIf.clear();
+  NeedsKillCleanup.clear();
 
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
index a2358f3a80f4..172e6bf32721 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -61,9 +61,11 @@ loop:
   br label %loop
 }
 
-; In case there's an epilog, we shouldn't have to do this.
+; Check that the epilog is the final block
 ; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
 define amdgpu_ps float @return_nonvoid(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index fee3158d4296..f178259f2b6f 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -470,7 +470,11 @@ bb9:                                              ; preds = %bb4
 }
 
 ; CHECK-LABEL: {{^}}cbranch_kill:
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: ; %bb.{{[0-9]+}}: ; %export
+; CHECK-NEXT: s_or_b64
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
   %val0 = extractelement <2 x float> %1, i32 0