AMDGPU: Fix verifier error with kill intrinsic

Don't create a terminator (the exec-nonzero skip branch plus the
export-null/endpgm sequence) in the middle of the block; split the block
and emit that sequence into its own block instead.
We should probably get rid of this intrinsic.

llvm-svn: 275203
This commit is contained in:
Matt Arsenault 2016-07-12 19:01:23 +00:00
parent b9f8e29290
commit 657f871a4e
3 changed files with 271 additions and 69 deletions

View File

@ -76,7 +76,7 @@ private:
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void Skip(MachineInstr &From, MachineOperand &To);
void SkipIfDead(MachineInstr &MI);
bool skipIfDead(MachineInstr &MI);
void If(MachineInstr &MI);
void Else(MachineInstr &MI, bool ExecModified);
@ -89,12 +89,16 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
void splitBlockLiveIns(const MachineBasicBlock &MBB,
const MachineInstr &MI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &RemainderBB,
unsigned SaveReg,
const MachineOperand &IdxReg);
std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
const MachineRegisterInfo &MRI,
const MachineInstr &MI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &RemainderBB,
unsigned SaveReg,
const MachineOperand &IdxReg);
void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
MachineInstr *MovRel,
@ -171,7 +175,19 @@ bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;
if (++NumInstr >= SkipThreshold)
if (I->isInlineAsm()) {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const char *AsmStr = I->getOperand(0).getSymbolName();
// inlineasm length estimate is number of bytes assuming the longest
// instruction.
uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
NumInstr += MaxAsmSize / MAI->getMaxInstLength();
} else {
++NumInstr;
}
if (NumInstr >= SkipThreshold)
return true;
}
}
@ -189,36 +205,55 @@ void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
.addOperand(To);
}
void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {
bool SILowerControlFlow::skipIfDead(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
return;
return false;
MachineBasicBlock::iterator Insert = &MI;
++Insert;
LivePhysRegs RemainderLiveRegs(TRI);
RemainderLiveRegs.addLiveOuts(MBB);
MachineBasicBlock *SkipBB;
MachineBasicBlock *RemainderBB;
std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator());
const DebugLoc &DL = MI.getDebugLoc();
// If the exec mask is non-zero, skip the next two instructions
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addImm(3);
BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addMBB(RemainderBB);
MBB.addSuccessor(RemainderBB);
MachineBasicBlock::iterator Insert = SkipBB->begin();
// Exec mask is zero: Export to NULL target...
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
.addImm(0)
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
.addImm(0)
.addImm(1)
.addImm(1)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0);
BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
.addImm(0)
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
.addImm(0)
.addImm(1)
.addImm(1)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef);
// ... and terminate wavefront
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
// ... and terminate wavefront.
BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
for (const MachineInstr &Inst : reverse(*RemainderBB))
RemainderLiveRegs.stepBackward(Inst);
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
for (unsigned Reg : RemainderLiveRegs) {
if (MRI.isAllocatable(Reg))
RemainderBB->addLiveIn(Reg);
}
return true;
}
void SILowerControlFlow::If(MachineInstr &MI) {
@ -386,20 +421,13 @@ void SILowerControlFlow::Kill(MachineInstr &MI) {
}
// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
const MachineInstr &MI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &RemainderBB,
unsigned SaveReg,
const MachineOperand &IdxReg) {
LivePhysRegs RemainderLiveRegs(TRI);
RemainderLiveRegs.addLiveOuts(MBB);
for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
I != E; ++I) {
RemainderLiveRegs.stepBackward(*I);
}
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
const MachineRegisterInfo &MRI,
const MachineInstr &MI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &RemainderBB,
unsigned SaveReg,
const MachineOperand &IdxReg) {
// Add reg defined in loop body.
RemainderLiveRegs.addReg(SaveReg);
@ -410,13 +438,11 @@ void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
}
}
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
for (unsigned Reg : RemainderLiveRegs) {
if (MRI.isAllocatable(Reg))
RemainderBB.addLiveIn(Reg);
}
const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
if (!Src->isUndef())
LoopBB.addLiveIn(Src->getReg());
@ -469,6 +495,30 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
.addMBB(&LoopBB);
}
// Split \p MBB at iterator \p I into three blocks: the original MBB (now
// ending just before I), a freshly created empty LoopBB, and RemainderBB,
// which receives everything from I to the old end of MBB along with MBB's
// successors. Only the MBB -> LoopBB edge is added here; callers are
// responsible for wiring LoopBB's own successor edges (e.g. the back edge
// and the edge into RemainderBB) and for fixing up RemainderBB's live-ins.
// Returns {LoopBB, RemainderBB}.
std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
MachineFunction *MF = MBB.getParent();
// To insert the loop we need to split the block. Move everything after this
// point to a new block, and insert a new empty block between the two.
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
// Place both new blocks immediately after MBB in layout order:
// MBB, LoopBB, RemainderBB.
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF->insert(MBBI, LoopBB);
MF->insert(MBBI, RemainderBB);
// Move the rest of the block into a new block.
RemainderBB->transferSuccessors(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
MBB.addSuccessor(LoopBB);
return std::make_pair(LoopBB, RemainderBB);
}
// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
@ -492,7 +542,6 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
return false;
}
MachineFunction &MF = *MBB.getParent();
MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
SaveOp->setIsDead(false);
unsigned Save = SaveOp->getReg();
@ -505,25 +554,24 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
.addReg(AMDGPU::EXEC);
// To insert the loop we need to split the block. Move everything after this
// point to a new block, and insert a new empty block between the two.
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
LivePhysRegs RemainderLiveRegs(TRI);
MF.insert(MBBI, LoopBB);
MF.insert(MBBI, RemainderBB);
RemainderLiveRegs.addLiveOuts(MBB);
LoopBB->addSuccessor(LoopBB);
MachineBasicBlock *LoopBB;
MachineBasicBlock *RemainderBB;
std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
for (const MachineInstr &Inst : reverse(*RemainderBB))
RemainderLiveRegs.stepBackward(Inst);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
LoopBB->addSuccessor(RemainderBB);
LoopBB->addSuccessor(LoopBB);
splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx);
// Move the rest of the block into a new block.
RemainderBB->transferSuccessors(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
MBB.addSuccessor(LoopBB);
splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
*RemainderBB, Save, *Idx);
emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
@ -695,16 +743,25 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
SkipIfDead(MI);
HaveKill = false;
if (skipIfDead(MI)) {
NextBB = std::next(BI);
BE = MF.end();
Next = MBB.end();
}
}
EndCf(MI);
break;
case AMDGPU::SI_KILL:
if (Depth == 0)
SkipIfDead(MI);
else
if (Depth == 0) {
if (skipIfDead(MI)) {
NextBB = std::next(BI);
BE = MF.end();
Next = MBB.end();
}
} else
HaveKill = true;
Kill(MI);
break;

View File

@ -3,7 +3,7 @@
# CHECK-LABEL: name: extract_undef_offset_vgpr{{$}}
# CHECK: bb.1:
# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
# CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@ -103,7 +103,7 @@ body: |
# CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}}
# CHECK: bb.1:
# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@ -159,7 +159,7 @@ body: |
# CHECK-LABEL: name: insert_undef_offset_vgpr{{$}}
# CHECK: bb.1:
# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@ -215,7 +215,7 @@ body: |
# CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}}
# CHECK: bb.1:
# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@ -272,7 +272,7 @@ body: |
# CHECK-LABEL: insert_undef_value_offset_vgpr{{$}}
# CHECK: bb.1:
# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
# CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec

View File

@ -0,0 +1,145 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
; CHECK-NEXT: ; BB#0:
; CHECK-NEXT: s_endpgm
; Kill with a constant non-negative operand: nothing is killed, so the
; CHECK lines expect no code other than s_endpgm.
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
call void @llvm.AMDGPU.kill(float 0.0)
ret void
}
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
; CHECK-NEXT: ; BB#0:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: s_endpgm
; Kill with constant -0.0: per the CHECK lines this folds to clearing the
; exec mask outright (s_mov_b64 exec, 0), with no skip branch needed.
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
call void @llvm.AMDGPU.kill(float -0.0)
ret void
}
; CHECK-LABEL: {{^}}test_kill_depth_var:
; CHECK-NEXT: ; BB#0:
; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
; CHECK-NEXT: s_endpgm
; Kill with a variable operand: per the CHECK lines this lowers to a single
; v_cmpx, which updates exec directly; nothing follows, so no skip branch.
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
call void @llvm.AMDGPU.kill(float %x)
ret void
}
; FIXME: why does the skip depend on the asm length in the same block?
; CHECK-LABEL: {{^}}test_kill_control_flow:
; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
; CHECK: ; BB#1:
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; BB#3:
; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7
; CHECK-NEXT: {{^}}BB{{[0-9]+_[0-9]+}}:
; CHECK-NEXT: s_endpgm
; Kill inside a conditional block. The CHECK lines expect the lowering to
; branch (s_cbranch_execnz) to a split-off block when exec is non-zero,
; with the export-null + s_endpgm sequence in its own fall-through block —
; i.e. no terminator emitted mid-block.
define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit
bb:
; NOTE(review): presumably the v_nop_e64 padding exists to push the block
; past the skip threshold (shouldSkip now counts inline-asm bytes) so the
; skip branch is actually emitted — confirm against SkipThreshold.
%var = call float asm sideeffect "
v_mov_b32_e64 v7, -1
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64", "={VGPR7}"()
call void @llvm.AMDGPU.kill(float %var)
br label %exit
exit:
ret void
}
; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; BB#1: ; %bb
; CHECK: v_mov_b32_e64 v7, -1
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: ;;#ASMEND
; CHECK: v_mov_b32_e64 v8, -1
; CHECK: ;;#ASMEND
; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; BB#3:
; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7
; CHECK: buffer_store_dword v8
; CHECK: v_mov_b32_e64 v9, -2
; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
; CHECK: buffer_store_dword v9
; CHECK-NEXT: s_endpgm
; Like test_kill_control_flow, but with values that must stay live across
; the block split: %live.across (v8) is used after the kill in the same IR
; block, and %live.out (v9) flows into the phi in %exit. The CHECK lines
; verify both stores survive in the remainder blocks, exercising the
; live-in recomputation done after splitting.
define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit
bb:
%var = call float asm sideeffect "
v_mov_b32_e64 v7, -1
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64", "={VGPR7}"()
%live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"()
call void @llvm.AMDGPU.kill(float %var)
store volatile float %live.across, float addrspace(1)* undef
%live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"()
br label %exit
exit:
%phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
store float %phi, float addrspace(1)* undef
ret void
}
declare void @llvm.AMDGPU.kill(float) #0
attributes #0 = { nounwind }