AMDGPU/SIInsertSkips: Fix the determination of whether early-exit-after-kill is possible

Summary:
The old code made some incorrect assumptions about the order in which
basic blocks are laid out in a function. This could lead to incorrect
early exits, especially when kills occurred inside loops.

The new approach is to check whether the point where the conditional
kill occurs dominates all reachable code. If that is the case, there
cannot be any other threads in the wave that are waiting to rejoin
at a later point in the CFG, i.e. if exec=0 at that point, then all
threads really are dead and we can exit the wave.
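
To make the check concrete, here is a condensed, free-standing sketch of
the dominatesAllReachable helper the patch adds (in the actual pass, MDT is
a member pointing at the MachineDominatorTree analysis, and depth_first
comes from llvm/ADT/DepthFirstIterator.h):

    // The block containing the kill must dominate every block reachable from
    // it; otherwise some threads may still need to rejoin at a later point in
    // the CFG, and exiting the wave on exec=0 would be wrong.
    static bool dominatesAllReachable(MachineDominatorTree &MDT,
                                      MachineBasicBlock &MBB) {
      for (MachineBasicBlock *Other : depth_first(&MBB)) {
        if (!MDT.dominates(&MBB, Other))
          return false;
      }
      return true;
    }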

Make some other minor cleanups to the pass while we're at it.

v2: preserve the dominator tree

Reviewers: arsenm, cdevadas, foad, critson

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D74908

Change-Id: Ia0d2b113ac944ad642d1c622b6da1b20aa1aabcc
Nicolai Hähnle 2020-02-21 13:36:01 +01:00
parent fcfd4fb686
commit 0f1df48925
2 changed files with 162 additions and 131 deletions

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp

@@ -18,9 +18,11 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -28,6 +30,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -52,16 +55,16 @@ private:
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   unsigned SkipThreshold = 0;
+  MachineDominatorTree *MDT = nullptr;

   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;

-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+  bool dominatesAllReachable(MachineBasicBlock &MBB);
+  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  DebugLoc DL);

-  void kill(MachineInstr &MI);
+  bool kill(MachineInstr &MI);

-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;

   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
@@ -79,6 +82,8 @@ public:
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -87,8 +92,11 @@ public:
 char SIInsertSkips::ID = 0;

-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
-                "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+                      "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+                    "SI insert s_cbranch_execz instructions", false, false)

 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
@@ -146,42 +154,73 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
   return false;
 }

-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
-
-  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(MBB, MBB.getParent()->back()))
-    return false;
-
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
-
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
-
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
-    .addImm(0x09)  // V_008DFC_SQ_EXP_NULL
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addImm(1)  // vm
-    .addImm(0)  // compr
-    .addImm(0); // en
-
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
-
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock *Other : depth_first(&MBB)) {
+    if (!MDT->dominates(&MBB, Other))
+      return false;
+  }
   return true;
 }

-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL) {
+  MachineFunction *MF = MBB.getParent();
+  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+  // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
+  // terminator of a basic block. If this ever changes, we need to optionally
+  // split MBB here.
+  assert(I == MBB.end());
+
+  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+  // basic block that has no further successors (e.g., there was an
+  // `unreachable` there in IR). This can happen with original source of the
+  // form:
+  //
+  //   if (uniform_condition) {
+  //     write_to_memory();
+  //     discard;
+  //   }
+  //
+  // In this case, we write the "null_export; s_endpgm" skip code in the
+  // already-existing basic block.
+  auto NextBBI = std::next(MBB.getIterator());
+  bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+  MachineBasicBlock *SkipBB;
+
+  if (NoSuccessor) {
+    SkipBB = &MBB;
+  } else {
+    // Create a new basic block that will contain the "null export; s_endpgm"
+    // and set up the branching to go around it.
+    SkipBB = MF->CreateMachineBasicBlock();
+    MF->insert(NextBBI, SkipBB);
+
+    BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
+    MBB.addSuccessor(SkipBB);
+
+    MDT->addNewBlock(SkipBB, &MBB);
+  }
+
+  // Generate "null export; s_endpgm".
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
+      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addImm(1)  // vm
+      .addImm(0)  // compr
+      .addImm(0); // en
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
+
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
@@ -268,7 +307,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
       I.addImm(0); // omod
     }
-    break;
+    return true;
   }
   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     const MachineFunction *MF = MI.getParent()->getParent();
@@ -283,11 +322,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
     int64_t Imm = Op.getImm();
     assert(Imm == 0 || Imm == -1);

-    if (Imm == KillVal)
+    if (Imm == KillVal) {
       BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                    : AMDGPU::S_MOV_B64), Exec)
         .addImm(0);
-    break;
+      return true;
+    }
+    return false;
   }

   unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
@@ -296,27 +337,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
     BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
       .addReg(Exec)
       .add(Op);
-    break;
+    return true;
   }
   default:
     llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }

-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
-
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-  MF->insert(MBBI, SkipBB);
-
-  MBB.addSuccessor(SkipBB);
-
-  return SkipBB;
-}
-
 // Returns true if a branch over the block was inserted.
 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                    MachineBasicBlock &SrcMBB) {
@@ -430,47 +457,21 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;

-  bool HaveKill = false;
+  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+  SmallVector<MachineInstr *, 4> KillInstrs;
   bool MadeChange = false;

-  // Track depth of exec mask, divergent branches.
-  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
-  MachineFunction::iterator NextBB;
-
-  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; BI = NextBB) {
-    NextBB = std::next(BI);
-    MachineBasicBlock &MBB = *BI;
-    bool HaveSkipBlock = false;
-
-    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
-      // Reached convergence point for last divergent branch.
-      ExecBranchStack.pop_back();
-    }
-
-    if (HaveKill && ExecBranchStack.empty()) {
-      HaveKill = false;
-      // TODO: Insert skip if exec is 0?
-    }
-
+  for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator I, Next;
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;

       switch (MI.getOpcode()) {
-      case AMDGPU::S_CBRANCH_EXECZ:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
-        break;
-
       case AMDGPU::SI_MASK_BRANCH:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
         MadeChange |= skipMaskBranch(MI, MBB);
         break;
@@ -478,32 +479,37 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         // Optimize out branches to the next block.
         // FIXME: Shouldn't this be handled by BranchFolding?
         if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+          assert(&MI == &MBB.back());
           MI.eraseFromParent();
-        } else if (HaveSkipBlock) {
-          // Remove the given unconditional branch when a skip block has been
-          // inserted after the current one and let skip the two instructions
-          // performing the kill if the exec mask is non-zero.
-          MI.eraseFromParent();
+          MadeChange = true;
         }
         break;

       case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-      case AMDGPU::SI_KILL_I1_TERMINATOR:
+      case AMDGPU::SI_KILL_I1_TERMINATOR: {
         MadeChange = true;
-        kill(MI);
+        bool CanKill = kill(MI);

-        if (ExecBranchStack.empty()) {
-          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
-            HaveSkipBlock = true;
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
+        // Check if we can add an early "if exec=0 { end shader }".
+        //
+        // Note that we _always_ do this if it is correct, even if the kill
+        // happens fairly late in the shader, because the null export should
+        // generally still be cheaper than normal export(s).
+        //
+        // TODO: The dominatesAllReachable check is conservative: if the
+        // dominance is only missing due to _uniform_ branches, we could
+        // in fact insert the early-exit as well.
+        if (CanKill &&
+            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          // Mark the instruction for kill-if-dead insertion. We delay this
+          // change because it modifies the CFG.
+          KillInstrs.push_back(&MI);
         } else {
-          HaveKill = true;
+          MI.eraseFromParent();
         }
-
-        MI.eraseFromParent();
         break;
+      }

       case AMDGPU::SI_RETURN_TO_EPILOG:
         // FIXME: Should move somewhere else
@@ -511,7 +517,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
         // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+        if (&MBB != &MF.back() || &MI != &MBB.back()) {
           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
           // the end and jump there.
           if (!EmptyMBBAtEnd) {
@@ -520,9 +526,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
           }

           MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
             .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
+          MI.eraseFromParent();
         }
         break;
@@ -537,5 +543,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
     }
   }

+  for (MachineInstr *Kill : KillInstrs) {
+    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+               Kill->getDebugLoc());
+    Kill->eraseFromParent();
+  }
+  KillInstrs.clear();
+
   return MadeChange;
 }

diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

@@ -12,7 +12,11 @@ define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: s_cbranch_execnz BB1_2
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -23,9 +27,15 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB2_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK: BB2_2:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB2_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB2_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -36,7 +46,10 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB3_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB3_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -48,9 +61,15 @@ define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB4_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB4_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -59,12 +78,19 @@ define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   ret void
 }

+; FIXME: Ideally only one early-exit would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB5_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB5_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -119,14 +145,12 @@ define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
 ; CHECK: v_nop_e64
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7

-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
+
+; CHECK: v_mov_b32_e32 v0, 1.0
+define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %bb, label %exit
@@ -149,7 +173,7 @@ bb:
   br label %exit

 exit:
-  ret void
+  ret float 1.0
 }

 ; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
@@ -171,13 +195,10 @@ exit:
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null

 ; CHECK: buffer_store_dword v8
 ; CHECK: v_mov_b32_e64 v9, -2
@@ -435,10 +456,7 @@ export:
 ; CHECK-LABEL: {{^}}complex_loop:
 ; CHECK: s_mov_b64 exec, 0
-; The following is an error, since it happens nested inside the loop:
-; CHECK-NEXT: s_cbranch_execnz
-; CHECK-NEXT: ; %bb.{{[0-9]+}}
-; CHECK-NEXT: exp null
+; CHECK-NOT: exp null
 define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 .entry:
   %flaga = icmp sgt i32 %cmpa, 0