[AMDGPU] Improve killed check for vgpr optimization

The killed flag is not always set. For example, when a variable is used
in a loop, it is never marked as killed, even though it is unused in the
following basic blocks. Also, kill flags are being deprecated, so we
should avoid relying on them.

Check if the register is live in the endif block. If not, consider it
killed in the then and else blocks.

The vgpr-liverange tests have two new tests with loops
(pre-committed, so the diff is visible).
I also needed to change the subtarget to gfx10.1; otherwise, calls
do not work.

Differential Revision: https://reviews.llvm.org/D106291
This commit is contained in:
Sebastian Neubauer 2021-07-21 15:20:10 +02:00
parent aba1f157ca
commit b642d01fa8
8 changed files with 70 additions and 55 deletions

View File

@ -230,15 +230,24 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
continue;
if (MO.isKill() && MO.readsReg()) {
if (MO.readsReg()) {
LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
// Make sure two conditions are met:
// a.) the value is defined before/in the IF block
// b.) should be defined in the same loop-level.
if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
KillsInElse.insert(MOReg);
Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) {
// Check if the register is live into the endif block. If not,
// consider it killed in the else region.
LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
if (!VI.isLiveIn(*Endif, MOReg, *MRI)) {
KillsInElse.insert(MOReg);
} else {
LLVM_DEBUG(dbgs() << "Excluding " << printReg(MOReg, TRI)
<< " as Live in Endif\n");
}
}
}
}
}

View File

@ -158,8 +158,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2
; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -844,8 +844,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -1023,8 +1023,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5
; CGP-NEXT: v_xor_b32_e32 v3, v3, v5
; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -2661,8 +2661,8 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -3149,8 +3149,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -3328,8 +3328,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5
; CGP-NEXT: v_xor_b32_e32 v3, v3, v5
; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]

View File

@ -156,8 +156,8 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v6, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -830,8 +830,8 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v4, v0, v10
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -1005,8 +1005,8 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8
; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -2623,8 +2623,8 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v6
; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v6, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -3103,8 +3103,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v4, v0, v10
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -3278,8 +3278,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8
; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]

View File

@ -143,8 +143,8 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -765,8 +765,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -929,8 +929,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -2428,8 +2428,8 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -2871,8 +2871,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@ -3035,8 +3035,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]

View File

@ -142,8 +142,8 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -756,8 +756,8 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -917,8 +917,8 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -1787,8 +1787,8 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -2225,8 +2225,8 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -2386,8 +2386,8 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: ; implicit-def: $vgpr2
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]

View File

@ -136,8 +136,8 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v1, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB0_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -289,8 +289,8 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB1_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -455,8 +455,8 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v6
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v6, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB2_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -605,8 +605,8 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB3_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -908,8 +908,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB8_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[10:11]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@ -1077,8 +1077,8 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: BB9_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]

View File

@ -205,27 +205,28 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI: bb.1.Flow:
; SI: successors: %bb.2(0x40000000), %bb.8(0x40000000)
; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.7
; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %45:vgpr_32, %bb.7
; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %47:vgpr_32, %bb.7
; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.7
; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.7
; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.7
; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI: S_BRANCH %bb.2
; SI: bb.2.if:
; SI: successors: %bb.3(0x80000000)
; SI: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1
; SI: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1
; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; SI: bb.3:
; SI: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; SI: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %49:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2
; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI undef %51:vgpr_32, %bb.3, [[COPY4]], %bb.2
; SI: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec
; SI: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec
; SI: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2
; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.3, [[PHI1]], %bb.2
; SI: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
; SI: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
; SI: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
; SI: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI3]], implicit $exec
; SI: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI4]], implicit $exec
; SI: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec
; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; SI: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
; SI: $vgpr0 = COPY killed [[PHI4]]
; SI: $vgpr0 = COPY killed [[PHI5]]
; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; SI: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
@ -242,16 +243,17 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; SI: bb.6:
; SI: successors: %bb.6(0x40000000), %bb.7(0x40000000)
; SI: [[PHI5:%[0-9]+]]:vreg_64 = PHI undef %53:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5
; SI: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI5]].sub0, implicit $exec
; SI: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI5]].sub1, implicit $exec
; SI: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5
; SI: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.6, [[COPY4]], %bb.5
; SI: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec
; SI: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec
; SI: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
; SI: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI5]], implicit $exec
; SI: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI6]], implicit $exec
; SI: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec
; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; SI: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
; SI: $vgpr0 = COPY [[COPY4]]
; SI: $vgpr0 = COPY killed [[PHI7]]
; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; SI: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
@ -263,9 +265,9 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
; SI: S_BRANCH %bb.1
; SI: bb.8.end:
; SI: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4
; SI: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4
; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI: $vgpr0 = COPY killed [[PHI6]]
; SI: $vgpr0 = COPY killed [[PHI8]]
; SI: SI_RETURN_TO_EPILOG killed $vgpr0
main_body:
%cc = icmp sgt i32 %z, 5

View File

@ -157,15 +157,16 @@ for.end:
define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
; SI-LABEL: loop:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s38, -1
; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: v_mov_b32_e32 v40, v1
; SI-NEXT: v_mov_b32_e32 v0, v1
; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: s_mov_b32 s39, 0x31c16000
; SI-NEXT: s_add_u32 s36, s36, s1
; SI-NEXT: s_addc_u32 s37, s37, 0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_mov_b32 s32, 0
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s33, exec_lo, s0
@ -177,15 +178,17 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI-NEXT: v_readfirstlane_b32 s5, v5
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
; SI-NEXT: v_mov_b32_e32 v0, v40
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
; SI-NEXT: s_cbranch_execnz BB3_2
; SI-NEXT: ; %bb.3:
; SI-NEXT: s_mov_b32 exec_lo, s34
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: BB3_4: ; %Flow
; SI-NEXT: s_or_saveexec_b32 s33, s33
@ -198,18 +201,19 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
; SI-NEXT: v_readfirstlane_b32 s5, v3
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
; SI-NEXT: v_mov_b32_e32 v0, v40
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: ; implicit-def: $vgpr2_vgpr3
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
; SI-NEXT: s_cbranch_execnz BB3_6
; SI-NEXT: ; %bb.7:
; SI-NEXT: s_mov_b32 exec_lo, s34
; SI-NEXT: BB3_8: ; %end
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s33
; SI-NEXT: v_mov_b32_e32 v0, v1
; SI-NEXT: ; return to shader part epilog
main_body:
%cc = icmp sgt i32 %z, 5