forked from OSchip/llvm-project
[AMDGPU] Apply pre-emit s_cbranch_vcc optimization to more patterns
Add handling of s_andn2 and mask of 0. This eliminates redundant instructions from uniform control flow. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D83641
This commit is contained in:
parent
233af8958e
commit
674226126d
|
@ -54,14 +54,14 @@ char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
|
||||||
|
|
||||||
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
||||||
// Match:
|
// Match:
|
||||||
// sreg = -1
|
// sreg = -1 or 0
|
||||||
// vcc = S_AND_B64 exec, sreg
|
// vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
|
||||||
// S_CBRANCH_VCC[N]Z
|
// S_CBRANCH_VCC[N]Z
|
||||||
// =>
|
// =>
|
||||||
// S_CBRANCH_EXEC[N]Z
|
// S_CBRANCH_EXEC[N]Z
|
||||||
// We end up with this pattern sometimes after basic block placement.
|
// We end up with this pattern sometimes after basic block placement.
|
||||||
// It happens while combining a block which assigns -1 to a saved mask and
|
// It happens while combining a block which assigns -1 or 0 to a saved mask
|
||||||
// another block which consumes that saved mask and then a branch.
|
// and another block which consumes that saved mask and then a branch.
|
||||||
bool Changed = false;
|
bool Changed = false;
|
||||||
MachineBasicBlock &MBB = *MI.getParent();
|
MachineBasicBlock &MBB = *MI.getParent();
|
||||||
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
|
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
|
||||||
|
@ -69,6 +69,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
||||||
const unsigned CondReg = TRI->getVCC();
|
const unsigned CondReg = TRI->getVCC();
|
||||||
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||||
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
|
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
|
||||||
|
const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
|
||||||
|
|
||||||
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
|
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
|
||||||
E = MBB.rend();
|
E = MBB.rend();
|
||||||
|
@ -80,7 +81,8 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
||||||
if (A->modifiesRegister(ExecReg, TRI))
|
if (A->modifiesRegister(ExecReg, TRI))
|
||||||
return false;
|
return false;
|
||||||
if (A->modifiesRegister(CondReg, TRI)) {
|
if (A->modifiesRegister(CondReg, TRI)) {
|
||||||
if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
|
if (!A->definesRegister(CondReg, TRI) ||
|
||||||
|
(A->getOpcode() != And && A->getOpcode() != AndN2))
|
||||||
return false;
|
return false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -97,9 +99,10 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
||||||
}
|
}
|
||||||
if (Op1.getReg() != ExecReg)
|
if (Op1.getReg() != ExecReg)
|
||||||
return Changed;
|
return Changed;
|
||||||
if (Op2.isImm() && Op2.getImm() != -1)
|
if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
|
||||||
return Changed;
|
return Changed;
|
||||||
|
|
||||||
|
int64_t MaskValue = 0;
|
||||||
Register SReg;
|
Register SReg;
|
||||||
if (Op2.isReg()) {
|
if (Op2.isReg()) {
|
||||||
SReg = Op2.getReg();
|
SReg = Op2.getReg();
|
||||||
|
@ -113,28 +116,75 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
|
||||||
ReadsSreg |= M->readsRegister(SReg, TRI);
|
ReadsSreg |= M->readsRegister(SReg, TRI);
|
||||||
}
|
}
|
||||||
if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
|
if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
|
||||||
M->getOperand(1).getImm() != -1)
|
(M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
|
||||||
return Changed;
|
return Changed;
|
||||||
// First if sreg is only used in and instruction fold the immediate
|
MaskValue = M->getOperand(1).getImm();
|
||||||
// into that and.
|
// First if sreg is only used in the AND instruction fold the immediate
|
||||||
|
// into the AND.
|
||||||
if (!ReadsSreg && Op2.isKill()) {
|
if (!ReadsSreg && Op2.isKill()) {
|
||||||
A->getOperand(2).ChangeToImmediate(-1);
|
A->getOperand(2).ChangeToImmediate(MaskValue);
|
||||||
M->eraseFromParent();
|
M->eraseFromParent();
|
||||||
}
|
}
|
||||||
|
} else if (Op2.isImm()) {
|
||||||
|
MaskValue = Op2.getImm();
|
||||||
|
} else {
|
||||||
|
llvm_unreachable("Op2 must be register or immediate");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Invert mask for s_andn2
|
||||||
|
assert(MaskValue == 0 || MaskValue == -1);
|
||||||
|
if (A->getOpcode() == AndN2)
|
||||||
|
MaskValue = ~MaskValue;
|
||||||
|
|
||||||
if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
|
if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
|
||||||
MI.killsRegister(CondReg, TRI))
|
MI.killsRegister(CondReg, TRI))
|
||||||
A->eraseFromParent();
|
A->eraseFromParent();
|
||||||
|
|
||||||
bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
|
bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
|
||||||
if (SReg == ExecReg) {
|
if (SReg == ExecReg) {
|
||||||
|
// EXEC is updated directly
|
||||||
if (IsVCCZ) {
|
if (IsVCCZ) {
|
||||||
MI.eraseFromParent();
|
MI.eraseFromParent();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
|
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
|
||||||
} else {
|
} else if (IsVCCZ && MaskValue == 0) {
|
||||||
|
// Will always branch
|
||||||
|
// Remove all successors shadowed by new unconditional branch
|
||||||
|
MachineBasicBlock *Parent = MI.getParent();
|
||||||
|
SmallVector<MachineInstr *, 4> ToRemove;
|
||||||
|
bool Found = false;
|
||||||
|
for (MachineInstr &Term : Parent->terminators()) {
|
||||||
|
if (Found) {
|
||||||
|
if (Term.isBranch())
|
||||||
|
ToRemove.push_back(&Term);
|
||||||
|
} else {
|
||||||
|
Found = Term.isIdenticalTo(MI);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(Found && "conditional branch is not terminator");
|
||||||
|
for (auto BranchMI : ToRemove) {
|
||||||
|
MachineOperand &Dst = BranchMI->getOperand(0);
|
||||||
|
assert(Dst.isMBB() && "destination is not basic block");
|
||||||
|
Parent->removeSuccessor(Dst.getMBB());
|
||||||
|
BranchMI->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
|
||||||
|
Parent->removeSuccessor(Succ);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rewrite to unconditional branch
|
||||||
|
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
|
||||||
|
} else if (!IsVCCZ && MaskValue == 0) {
|
||||||
|
// Will never branch
|
||||||
|
MachineOperand &Dst = MI.getOperand(0);
|
||||||
|
assert(Dst.isMBB() && "destination is not basic block");
|
||||||
|
MI.getParent()->removeSuccessor(Dst.getMBB());
|
||||||
|
MI.eraseFromParent();
|
||||||
|
return true;
|
||||||
|
} else if (MaskValue == -1) {
|
||||||
|
// Depends only on EXEC
|
||||||
MI.setDesc(
|
MI.setDesc(
|
||||||
TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
|
TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
|
||||||
}
|
}
|
||||||
|
|
|
@ -482,13 +482,10 @@ ret:
|
||||||
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
|
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
|
||||||
; GCN-NEXT: s_addc_u32
|
; GCN-NEXT: s_addc_u32
|
||||||
; GCN-NEXT: s_setpc_b64
|
; GCN-NEXT: s_setpc_b64
|
||||||
|
|
||||||
; GCN-NEXT: [[LONG_BR_0]]:
|
; GCN-NEXT: [[LONG_BR_0]]:
|
||||||
; GCN: s_setpc_b64
|
|
||||||
|
|
||||||
; GCN: [[LONG_BR_DEST0]]
|
; GCN: [[LONG_BR_DEST0]]:
|
||||||
|
|
||||||
; GCN: s_cbranch_vccnz
|
|
||||||
; GCN-DAG: v_cmp_lt_i32
|
; GCN-DAG: v_cmp_lt_i32
|
||||||
; GCN-DAG: v_cmp_ge_i32
|
; GCN-DAG: v_cmp_ge_i32
|
||||||
|
|
||||||
|
|
|
@ -524,7 +524,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(
|
||||||
|
|
||||||
; GCN: {{^; %bb.[0-9]}}:
|
; GCN: {{^; %bb.[0-9]}}:
|
||||||
; GCN: s_mov_b64 exec,
|
; GCN: s_mov_b64 exec,
|
||||||
; GCN: s_cbranch_vccnz [[BB2]]
|
; GCN: s_cbranch_execnz [[BB2]]
|
||||||
|
|
||||||
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
|
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
|
||||||
bb:
|
bb:
|
||||||
|
|
|
@ -159,7 +159,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
|
||||||
; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||||
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||||
; SI-NEXT: s_and_b64 vcc, exec, 0
|
; SI-NEXT: s_and_b64 vcc, exec, 0
|
||||||
; SI-NEXT: s_cbranch_vccz BB3_2
|
; SI-NEXT: s_branch BB3_2
|
||||||
; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
|
; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
|
||||||
; SI-NEXT: s_endpgm
|
; SI-NEXT: s_endpgm
|
||||||
; IR-LABEL: @infinite_loop_nest_ret(
|
; IR-LABEL: @infinite_loop_nest_ret(
|
||||||
|
|
|
@ -338,3 +338,80 @@ body: |
|
||||||
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
|
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
|
||||||
S_ENDPGM 0
|
S_ENDPGM 0
|
||||||
...
|
...
|
||||||
|
---
|
||||||
|
# GCN-LABEL: name: andn2_execz_mov_vccz
|
||||||
|
# GCN-NOT: S_MOV_
|
||||||
|
# GCN-NOT: S_ANDN2_
|
||||||
|
# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
|
||||||
|
name: andn2_execz_mov_vccz
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
$sgpr0_sgpr1 = S_MOV_B64 0
|
||||||
|
$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
|
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
|
||||||
|
S_ENDPGM 0
|
||||||
|
...
|
||||||
|
---
|
||||||
|
# GCN-LABEL: name: andn2_branch_mov_vccz
|
||||||
|
# GCN-NOT: S_MOV_
|
||||||
|
# GCN-NOT: S_ANDN2_
|
||||||
|
# GCN: S_BRANCH %bb.1
|
||||||
|
name: andn2_branch_mov_vccz
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
$sgpr0_sgpr1 = S_MOV_B64 -1
|
||||||
|
$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
|
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
|
||||||
|
S_ENDPGM 0
|
||||||
|
...
|
||||||
|
---
|
||||||
|
# GCN-LABEL: name: andn2_execnz_mov_vccnz
|
||||||
|
# GCN-NOT: S_MOV_
|
||||||
|
# GCN-NOT: S_ANDN2_
|
||||||
|
# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
||||||
|
name: andn2_execnz_mov_vccnz
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
$sgpr0_sgpr1 = S_MOV_B64 0
|
||||||
|
$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
|
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||||
|
S_ENDPGM 0
|
||||||
|
...
|
||||||
|
---
|
||||||
|
# GCN-LABEL: name: andn2_no_branch_mov_vccnz
|
||||||
|
# GCN-NOT: S_MOV_
|
||||||
|
# GCN-NOT: S_ANDN2_
|
||||||
|
# GCN-NOT: S_CBRANCH
|
||||||
|
# GCN-NOT: S_BRANCH
|
||||||
|
name: andn2_no_branch_mov_vccnz
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
S_NOP 0
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
$sgpr0_sgpr1 = S_MOV_B64 -1
|
||||||
|
$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
|
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||||
|
S_ENDPGM 0
|
||||||
|
...
|
||||||
|
|
|
@ -1327,9 +1327,6 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
|
||||||
; SI-NEXT: s_cbranch_vccz BB26_3
|
; SI-NEXT: s_cbranch_vccz BB26_3
|
||||||
; SI-NEXT: s_branch BB26_4
|
; SI-NEXT: s_branch BB26_4
|
||||||
; SI-NEXT: BB26_2:
|
; SI-NEXT: BB26_2:
|
||||||
; SI-NEXT: s_mov_b64 s[2:3], -1
|
|
||||||
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
||||||
; SI-NEXT: s_cbranch_vccnz BB26_4
|
|
||||||
; SI-NEXT: BB26_3: ; %if
|
; SI-NEXT: BB26_3: ; %if
|
||||||
; SI-NEXT: s_load_dword s1, s[6:7], 0x0
|
; SI-NEXT: s_load_dword s1, s[6:7], 0x0
|
||||||
; SI-NEXT: BB26_4: ; %endif
|
; SI-NEXT: BB26_4: ; %endif
|
||||||
|
@ -1350,14 +1347,9 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
|
||||||
; VI-NEXT: s_cbranch_scc0 BB26_2
|
; VI-NEXT: s_cbranch_scc0 BB26_2
|
||||||
; VI-NEXT: ; %bb.1: ; %else
|
; VI-NEXT: ; %bb.1: ; %else
|
||||||
; VI-NEXT: s_load_dword s1, s[6:7], 0x4
|
; VI-NEXT: s_load_dword s1, s[6:7], 0x4
|
||||||
; VI-NEXT: s_mov_b64 s[2:3], 0
|
; VI-NEXT: s_cbranch_execz BB26_3
|
||||||
; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
||||||
; VI-NEXT: s_cbranch_vccz BB26_3
|
|
||||||
; VI-NEXT: s_branch BB26_4
|
; VI-NEXT: s_branch BB26_4
|
||||||
; VI-NEXT: BB26_2:
|
; VI-NEXT: BB26_2:
|
||||||
; VI-NEXT: s_mov_b64 s[2:3], -1
|
|
||||||
; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
||||||
; VI-NEXT: s_cbranch_vccnz BB26_4
|
|
||||||
; VI-NEXT: BB26_3: ; %if
|
; VI-NEXT: BB26_3: ; %if
|
||||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||||
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
|
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
|
||||||
|
|
|
@ -367,7 +367,6 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
|
||||||
; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
|
; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
|
||||||
|
|
||||||
; GCN: {{^}}[[FLOW]]:
|
; GCN: {{^}}[[FLOW]]:
|
||||||
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
|
|
||||||
|
|
||||||
; GCN: s_or_b64 exec, exec
|
; GCN: s_or_b64 exec, exec
|
||||||
; GCN: v_mov_b32_e32 v0, 2.0
|
; GCN: v_mov_b32_e32 v0, 2.0
|
||||||
|
|
|
@ -19,15 +19,10 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a,
|
||||||
; SI-NEXT: s_cbranch_scc0 BB0_2
|
; SI-NEXT: s_cbranch_scc0 BB0_2
|
||||||
; SI-NEXT: ; %bb.1: ; %else
|
; SI-NEXT: ; %bb.1: ; %else
|
||||||
; SI-NEXT: s_add_i32 s2, s7, s2
|
; SI-NEXT: s_add_i32 s2, s7, s2
|
||||||
; SI-NEXT: s_mov_b64 s[8:9], 0
|
; SI-NEXT: s_cbranch_execz BB0_3
|
||||||
; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
|
|
||||||
; SI-NEXT: s_cbranch_vccz BB0_3
|
|
||||||
; SI-NEXT: s_branch BB0_4
|
; SI-NEXT: s_branch BB0_4
|
||||||
; SI-NEXT: BB0_2:
|
; SI-NEXT: BB0_2:
|
||||||
; SI-NEXT: s_mov_b64 s[8:9], -1
|
|
||||||
; SI-NEXT: ; implicit-def: $sgpr2
|
; SI-NEXT: ; implicit-def: $sgpr2
|
||||||
; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
|
|
||||||
; SI-NEXT: s_cbranch_vccnz BB0_4
|
|
||||||
; SI-NEXT: BB0_3: ; %if
|
; SI-NEXT: BB0_3: ; %if
|
||||||
; SI-NEXT: s_sub_i32 s2, s5, s6
|
; SI-NEXT: s_sub_i32 s2, s5, s6
|
||||||
; SI-NEXT: BB0_4: ; %endif
|
; SI-NEXT: BB0_4: ; %endif
|
||||||
|
@ -69,15 +64,10 @@ define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x
|
||||||
; SI-NEXT: s_load_dword s6, s[0:1], 0x37
|
; SI-NEXT: s_load_dword s6, s[0:1], 0x37
|
||||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||||
; SI-NEXT: s_add_i32 s3, s3, s6
|
; SI-NEXT: s_add_i32 s3, s3, s6
|
||||||
; SI-NEXT: s_mov_b64 s[6:7], 0
|
; SI-NEXT: s_cbranch_execz BB1_3
|
||||||
; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
||||||
; SI-NEXT: s_cbranch_vccz BB1_3
|
|
||||||
; SI-NEXT: s_branch BB1_4
|
; SI-NEXT: s_branch BB1_4
|
||||||
; SI-NEXT: BB1_2:
|
; SI-NEXT: BB1_2:
|
||||||
; SI-NEXT: s_mov_b64 s[6:7], -1
|
|
||||||
; SI-NEXT: ; implicit-def: $sgpr3
|
; SI-NEXT: ; implicit-def: $sgpr3
|
||||||
; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
||||||
; SI-NEXT: s_cbranch_vccnz BB1_4
|
|
||||||
; SI-NEXT: BB1_3: ; %if
|
; SI-NEXT: BB1_3: ; %if
|
||||||
; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
|
; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
|
||||||
; SI-NEXT: s_load_dword s0, s[0:1], 0x25
|
; SI-NEXT: s_load_dword s0, s[0:1], 0x25
|
||||||
|
|
|
@ -668,7 +668,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d
|
||||||
; GCN-LABEL: {{^}}test_loop_vcc:
|
; GCN-LABEL: {{^}}test_loop_vcc:
|
||||||
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
|
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
|
||||||
; GFX1064: v_cmp_lt_f32_e32 vcc,
|
; GFX1064: v_cmp_lt_f32_e32 vcc,
|
||||||
; GCN: s_cbranch_vccnz
|
; GCN: s_cbranch_vccz
|
||||||
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
|
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
|
||||||
entry:
|
entry:
|
||||||
br label %loop
|
br label %loop
|
||||||
|
|
|
@ -652,13 +652,11 @@ main_body:
|
||||||
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
|
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
|
||||||
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
|
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
|
||||||
|
|
||||||
; CHECK: ; %body
|
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
|
||||||
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
|
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
|
||||||
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
|
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
|
||||||
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
|
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
|
||||||
; CHECK: s_cbranch_vccz
|
; CHECK: s_cbranch_vccz [[LOOPHDR]]
|
||||||
|
|
||||||
; CHECK: s_cbranch_vccnz [[LOOPHDR]]
|
|
||||||
|
|
||||||
; CHECK: ; %break
|
; CHECK: ; %break
|
||||||
; CHECK: ; return
|
; CHECK: ; return
|
||||||
|
|
Loading…
Reference in New Issue