[AMDGPU] Extend pre-emit peephole to redundantly masked VCC

Extend pre-emit peephole for S_CBRANCH_VCC[N]Z to eliminate
redundant S_AND operations against EXEC for V_CMP results in VCC.
These occur after register allocation when VCC has been
selected as the comparison destination.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D120202
This commit is contained in:
Carl Ritson 2022-02-25 09:42:55 +09:00
parent 79787b903d
commit 565af157ef
19 changed files with 139 additions and 60 deletions

View File

@ -74,6 +74,15 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
// We end up with this pattern sometimes after basic block placement.
// It happens while combining a block which assigns -1 or 0 to a saved mask
// and another block which consumes that saved mask and then a branch.
//
// While searching this also performs the following substitution:
// vcc = V_CMP
// vcc = S_AND exec, vcc
// S_CBRANCH_VCC[N]Z
// =>
// vcc = V_CMP
// S_CBRANCH_VCC[N]Z
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@ -121,14 +130,27 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
SReg = Op2.getReg();
auto M = std::next(A);
bool ReadsSreg = false;
bool ModifiesExec = false;
for (; M != E; ++M) {
if (M->definesRegister(SReg, TRI))
break;
if (M->modifiesRegister(SReg, TRI))
return Changed;
ReadsSreg |= M->readsRegister(SReg, TRI);
ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
}
if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
if (M == E)
return Changed;
// If SReg is VCC and its definition is a VALU comparison,
// the S_AND with EXEC is not required.
// Erase the S_AND and return.
// Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
TII->isVOPC(*M) && TII->isVALU(*M)) {
A->eraseFromParent();
return true;
}
if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
(M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
return Changed;
MaskValue = M->getOperand(1).getImm();

View File

@ -623,7 +623,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13]
; GFX908-NEXT: v_add_co_u32_e64 v14, s[2:3], v14, v6
; GFX908-NEXT: v_addc_co_u32_e64 v15, s[2:3], v15, v7, s[2:3]
; GFX908-NEXT: s_and_b64 vcc, exec, vcc
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
@ -751,7 +750,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, v16, v10
; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v11, vcc
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
; GFX90A-NEXT: s_and_b64 vcc, exec, vcc
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1

View File

@ -82,7 +82,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
; GFX7-NEXT: s_and_b64 vcc, exec, vcc
; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
; GFX7-NEXT: ; %bb.1: ; %bb0
; GFX7-NEXT: v_mov_b32_e32 v0, 0
@ -109,7 +108,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: s_addc_u32 s0, s2, 0
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@ -136,7 +134,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_cmpk_lg_u32 s1, 0x0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: s_cbranch_vccnz .LBB1_2
; GFX10-NEXT: ; %bb.1: ; %bb0
; GFX10-NEXT: v_mov_b32_e32 v0, 0

View File

@ -71,7 +71,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1
; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %bb2
; GCN-NEXT: s_endpgm

View File

@ -431,7 +431,6 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: global_store_short v[5:6], v0, off
; GFX9-NEXT: s_cbranch_vccz .LBB4_1
@ -516,7 +515,6 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8
; GFX9-NEXT: global_store_short v[5:6], v0, off
@ -552,7 +550,6 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7
; GFX10-NEXT: global_store_short v[5:6], v0, off
; GFX10-NEXT: s_cbranch_vccz .LBB5_1
@ -608,7 +605,6 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v2|
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1]
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v8, v0
; GFX9-NEXT: global_store_short v[5:6], v0, off
; GFX9-NEXT: s_cbranch_vccz .LBB6_1
@ -701,7 +697,6 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_mov_b32_e32 v8, s5
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0
; GFX9-NEXT: global_store_short v[5:6], v0, off
@ -741,7 +736,6 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, s1
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v7, v0
; GFX10-NEXT: global_store_short v[5:6], v0, off
; GFX10-NEXT: s_cbranch_vccz .LBB7_1

View File

@ -535,3 +535,119 @@ body: |
S_CBRANCH_VCCZ %bb.1, implicit $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: and_cmp_vccz
# GCN: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
# GCN-NOT: S_AND_
# GCN: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
name: and_cmp_vccz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
$vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: and_cmp_vccnz
# GCN: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
# GCN-NOT: S_AND_
# GCN: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
name: and_cmp_vccnz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
$vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: andn2_cmp_vccz
# GCN: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
# GCN: $vcc = S_ANDN2_B64 $exec, $vcc, implicit-def dead $scc
# GCN: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
name: andn2_cmp_vccz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
$vcc = S_ANDN2_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: and_cmpclass_vccz
# GCN: V_CMP_CLASS_F32_e32 killed $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec
# GCN-NOT: S_AND_
# GCN: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
name: and_cmpclass_vccz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMP_CLASS_F32_e32 killed $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec
$vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: and_cmpx_vccz
# GCN: V_CMPX_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit-def $exec, implicit $exec
# GCN-NOT: S_AND_
# GCN: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
name: and_cmpx_vccz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMPX_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit-def $exec, implicit $exec
$vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...
---
# GCN-LABEL: name: and_or_cmp_vccz
# GCN: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
# GCN: $exec = S_OR_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
# GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
# GCN: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
name: and_or_cmp_vccz
body: |
bb.0:
S_NOP 0
bb.1:
S_NOP 0
bb.2:
V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
$exec = S_OR_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
$vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
S_ENDPGM 0
...

View File

@ -195,7 +195,6 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 s[6:7], -1
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
; GCN-NEXT: s_mov_b64 s[10:11], -1
; GCN-NEXT: s_cbranch_vccnz .LBB1_6
@ -203,7 +202,6 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_mov_b64 s[6:7], -1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_mov_b64 s[8:9], -1
; GCN-NEXT: s_cbranch_vccz .LBB1_5
; GCN-NEXT: ; %bb.4: ; %case1
@ -223,7 +221,6 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: ; %bb.7: ; %LeafBlock
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_mov_b64 s[8:9], -1
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.8: ; %case0

View File

@ -151,7 +151,6 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB1_6
; GCN-NEXT: ; %bb.1: ; %bb14.lr.ph
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
@ -176,7 +175,6 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB1_4
; GCN-NEXT: ; %bb.5: ; %bb21
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1

View File

@ -438,7 +438,6 @@ entry:
; {{^}}sopc_vopc_legalize_bug:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
; GCN: s_and_b64 vcc, exec, vcc
; GCN: s_cbranch_vccnz [[EXIT:.L[A-Z0-9_]+]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]

View File

@ -217,7 +217,6 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow6
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[12:13], 1
@ -1071,7 +1070,6 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
; GCN-IR-NEXT: .LBB9_4: ; %Flow3
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[12:13], 1
@ -1283,7 +1281,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
; GCN-IR-NEXT: .LBB10_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1

View File

@ -750,7 +750,6 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 vcc, exec, vcc
; SI-NEXT: s_cbranch_vccnz .LBB10_2
; SI-NEXT: .LBB10_4: ; %Flow1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@ -796,7 +795,6 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
; GFX10-WAVE64-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, vcc
; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB10_1
; GFX10-WAVE64-NEXT: .LBB10_3: ; %Flow1
; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3]
@ -840,7 +838,6 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
; GFX10-WAVE32-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_vccnz .LBB10_1
; GFX10-WAVE32-NEXT: .LBB10_3: ; %Flow1
; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1
@ -901,7 +898,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; SI-NEXT: v_mov_b32_e32 v0, 4.0
; SI-NEXT: .LBB11_3: ; %phibb
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 vcc, exec, vcc
; SI-NEXT: s_cbranch_vccz .LBB11_5
; SI-NEXT: ; %bb.4: ; %bb10
; SI-NEXT: s_mov_b32 s3, 0xf000
@ -934,7 +930,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX10-WAVE64-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WAVE64-NEXT: .LBB11_3: ; %phibb
; GFX10-WAVE64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, vcc
; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB11_5
; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb10
; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 9
@ -965,7 +960,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX10-WAVE32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WAVE32-NEXT: .LBB11_3: ; %phibb
; GFX10-WAVE32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_vccz .LBB11_5
; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb10
; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 9

View File

@ -189,7 +189,6 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow6
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1
@ -1081,7 +1080,6 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
; GCN-IR-NEXT: .LBB8_4: ; %Flow6
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], 1
@ -1243,7 +1241,6 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
; GCN-IR-NEXT: .LBB9_4: ; %Flow3
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], 1
@ -1457,7 +1454,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s7, s7, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
; GCN-IR-NEXT: .LBB10_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1

View File

@ -363,7 +363,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; NOHSA-TRAP-GFX900-V2-NEXT: global_load_dword v1, v0, s[0:1] glc
; NOHSA-TRAP-GFX900-V2-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-V2-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-V2-NEXT: s_and_b64 vcc, exec, vcc
; NOHSA-TRAP-GFX900-V2-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-V2-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 3
@ -381,7 +380,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; NOHSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-V3-NEXT: s_and_b64 vcc, exec, vcc
; NOHSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
@ -399,7 +397,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-V4-NEXT: s_and_b64 vcc, exec, vcc
; NOHSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3
@ -486,7 +483,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX803-V2-NEXT: flat_load_dword v0, v[0:1] glc
; HSA-TRAP-GFX803-V2-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-V2-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
; HSA-TRAP-GFX803-V2-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX803-V2-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX803-V2-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v0, s0
@ -508,7 +504,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX803-V3-NEXT: flat_load_dword v0, v[0:1] glc
; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
; HSA-TRAP-GFX803-V3-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX803-V3-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX803-V3-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0
@ -530,7 +525,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX803-V4-NEXT: flat_load_dword v0, v[0:1] glc
; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
; HSA-TRAP-GFX803-V4-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX803-V4-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX803-V4-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0
@ -619,7 +613,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX900-V2-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-TRAP-GFX900-V2-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX900-V2-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-TRAP-GFX900-V2-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX900-V2-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX900-V2-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 3
@ -638,7 +631,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-TRAP-GFX900-V3-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
@ -657,7 +649,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-TRAP-GFX900-V4-NEXT: s_and_b64 vcc, exec, vcc
; HSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3
@ -743,7 +734,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-NOTRAP-GFX900-V2-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-NOTRAP-GFX900-V2-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-V2-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-NOTRAP-GFX900-V2-NEXT: s_and_b64 vcc, exec, vcc
; HSA-NOTRAP-GFX900-V2-NEXT: s_cbranch_vccz .LBB1_2
; HSA-NOTRAP-GFX900-V2-NEXT: ; %bb.1: ; %ret
; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 3
@ -761,7 +751,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-NOTRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-NOTRAP-GFX900-V3-NEXT: s_and_b64 vcc, exec, vcc
; HSA-NOTRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
; HSA-NOTRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
@ -779,7 +768,6 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; HSA-NOTRAP-GFX900-V4-NEXT: s_and_b64 vcc, exec, vcc
; HSA-NOTRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
; HSA-NOTRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3

View File

@ -190,7 +190,6 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow6
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
@ -879,7 +878,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: s_addc_u32 s7, s7, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
; GCN-IR-NEXT: .LBB7_4: ; %Flow3
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
@ -1070,7 +1068,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s7, s7, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
; GCN-IR-NEXT: .LBB8_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
@ -1527,7 +1524,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3
; GCN-IR-NEXT: .LBB11_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1

View File

@ -118,7 +118,6 @@ done:
; Using a floating-point value in an integer compare will cause the compare to
; be selected for the SALU and then later moved to the VALU.
; GCN: v_cmp_ne_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
; GCN: s_and_b64 vcc, exec, [[COND]]
; GCN: s_cbranch_vccnz [[ENDIF_LABEL:.L[0-9_A-Za-z]+]]
; GCN: buffer_store_dword
; GCN: [[ENDIF_LABEL]]:
@ -143,7 +142,6 @@ endif:
; Using a floating-point value in an integer compare will cause the compare to
; be selected for the SALU and then later moved to the VALU.
; GCN: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
; GCN: s_and_b64 vcc, exec, [[COND]]
; GCN: s_cbranch_vccnz [[ENDIF_LABEL:.L[0-9_A-Za-z]+]]
; GCN: buffer_store_dword
; GCN: [[ENDIF_LABEL]]:

View File

@ -19,11 +19,9 @@ define hidden void @widget() {
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz .LBB0_3
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB0_4
; GCN-NEXT: ; %bb.2: ; %bb7
; GCN-NEXT: s_getpc_b64 s[16:17]
@ -33,7 +31,6 @@ define hidden void @widget() {
; GCN-NEXT: s_branch .LBB0_7
; GCN-NEXT: .LBB0_3: ; %bb2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB0_6
; GCN-NEXT: .LBB0_4: ; %bb9
; GCN-NEXT: s_getpc_b64 s[16:17]

View File

@ -189,7 +189,6 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
; GCN-IR-NEXT: .LBB0_4: ; %Flow6
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1
@ -887,7 +886,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s7, s7, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3
; GCN-IR-NEXT: .LBB6_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1
@ -1077,7 +1075,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-IR-NEXT: s_addc_u32 s7, s7, 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
; GCN-IR-NEXT: .LBB7_4: ; %Flow5
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1

View File

@ -495,9 +495,7 @@ entry:
; GCN-LABEL: {{^}}test_br_cc_f16:
; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
; GFX1032: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX1064: v_cmp_nlt_f16_e32 vcc,
; GFX1064: s_and_b64 vcc, exec, vcc{{$}}
; GCN-NEXT: s_cbranch_vccnz
define amdgpu_kernel void @test_br_cc_f16(
half addrspace(1)* %r,

View File

@ -1864,7 +1864,6 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
; GFX9-W64-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1
; GFX9-W64-NEXT: ; %bb.3:
; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1
@ -1914,7 +1913,6 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1
; GFX10-W32-NEXT: ; %bb.3:
; GFX10-W32-NEXT: s_mov_b32 s1, -1