[ARM] Ensure WLS preheader blocks have branches during memcpy lowering

This makes sure that the blocks created for lowering memcpy to loops end
up with branches, even if they fall through to the successor. Otherwise
IfCvt gets confused by the unanalyzable branches and creates invalid
block layouts.

In almost all cases the extra branches are removed again when the
tail-predicated loop is finalized.
This commit is contained in:
David Green 2021-05-24 11:26:45 +01:00
parent 6cc78b9245
commit 53c42f7700
3 changed files with 379 additions and 0 deletions

View File

@ -11133,6 +11133,10 @@ static Register genTPEntry(MachineBasicBlock *TpEntry,
.addUse(TotalIterationsReg)
.addMBB(TpExit);
BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
.addMBB(TpLoopBody)
.add(predOps(ARMCC::AL));
return TotalIterationsReg;
}

View File

@ -0,0 +1,374 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -arm-memtransfer-tploop=force-enabled %s -o - | FileCheck %s
; In this test, the successors of various blocks were becoming invalid after
; ifcvt, as the blocks did not properly fall through to their successor after a
; WhileLoopStart.
@arr_183 = external dso_local local_unnamed_addr global [20 x [23 x [19 x i8]]], align 1
; Reproducer body: @a issues eight llvm.memset calls into @arr_183, each with a
; run-time length of (108 - x), where x is either 0 or a zero-extended byte
; loaded through %c. The function contains no ret: both paths selected in
; %entry end in a loop that branches back to its own header.
; In the expected output below, six of the memsets become wlstp.8/letp loops;
; the remaining two keep an explicit vctp/vpst loop body and are entered
; through a beq that is followed by an unconditional b to the loop body.
define i32 @a(i8 zeroext %b, [3 x i8]* nocapture readonly %c, [3 x i32]* nocapture readonly %d) {
; CHECK-LABEL: a:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: bls.w .LBB0_11
; CHECK-NEXT: @ %bb.1: @ %for.body.us.preheader
; CHECK-NEXT: movw r5, :lower16:arr_183
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movt r5, :upper16:arr_183
; CHECK-NEXT: mov.w r12, #19
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_2: @ %land.end.us.3
; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: .LBB0_3: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
; CHECK-NEXT: @ Child Loop BB0_6 Depth 2
; CHECK-NEXT: @ Child Loop BB0_8 Depth 2
; CHECK-NEXT: @ Child Loop BB0_10 Depth 2
; CHECK-NEXT: ldr.w r0, [r2, r3, lsl #2]
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r0, [r1, r3]
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: mla r3, r3, r12, r5
; CHECK-NEXT: add r3, r0
; CHECK-NEXT: rsb.w r0, r0, #108
; CHECK-NEXT: wlstp.8 lr, r0, .LBB0_5
; CHECK-NEXT: .LBB0_4: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB0_4
; CHECK-NEXT: .LBB0_5: @ %land.end.us
; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: ldr r0, [r2, #4]
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r0, [r1, #1]
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: adds r3, r5, r0
; CHECK-NEXT: rsb.w r0, r0, #108
; CHECK-NEXT: adds r3, #19
; CHECK-NEXT: wlstp.8 lr, r0, .LBB0_7
; CHECK-NEXT: .LBB0_6: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q1, [r3], #16
; CHECK-NEXT: letp lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %land.end.us.1
; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: ldr r0, [r2, #4]
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r0, [r1, #1]
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: adds r3, r5, r0
; CHECK-NEXT: rsb.w r0, r0, #108
; CHECK-NEXT: adds r3, #19
; CHECK-NEXT: wlstp.8 lr, r0, .LBB0_9
; CHECK-NEXT: .LBB0_8: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: letp lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %land.end.us.2
; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: ldr r0, [r2, #4]
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r0, [r1, #1]
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: adds r3, r5, r0
; CHECK-NEXT: rsb.w r0, r0, #108
; CHECK-NEXT: add.w r4, r0, #15
; CHECK-NEXT: adds r3, #19
; CHECK-NEXT: lsrs r4, r4, #4
; CHECK-NEXT: subs.w lr, r4, #0
; CHECK-NEXT: beq .LBB0_2
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_10: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: subs r0, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q3, [r3], #16
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB0_10
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_11:
; CHECK-NEXT: movw r12, :lower16:arr_183
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: movt r12, :upper16:arr_183
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: b .LBB0_13
; CHECK-NEXT: .LBB0_12: @ %for.body.lr.ph.3
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r3, [r1, #1]
; CHECK-NEXT: moveq r3, #0
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: add.w r5, r3, #15
; CHECK-NEXT: lsrs r5, r5, #4
; CHECK-NEXT: subs.w lr, r5, #0
; CHECK-NEXT: beq .LBB0_13
; CHECK-NEXT: b .LBB0_23
; CHECK-NEXT: .LBB0_13: @ %for.cond
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_15 Depth 2
; CHECK-NEXT: @ Child Loop BB0_18 Depth 2
; CHECK-NEXT: @ Child Loop BB0_21 Depth 2
; CHECK-NEXT: @ Child Loop BB0_23 Depth 2
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_16
; CHECK-NEXT: @ %bb.14: @ %for.body.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r3, [r1, #1]
; CHECK-NEXT: moveq r3, #0
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_16
; CHECK-NEXT: .LBB0_15: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_15
; CHECK-NEXT: .LBB0_16: @ %for.cond.backedge
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_19
; CHECK-NEXT: @ %bb.17: @ %for.body.lr.ph.1
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r3, [r1, #1]
; CHECK-NEXT: moveq r3, #0
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_19
; CHECK-NEXT: .LBB0_18: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q1, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_18
; CHECK-NEXT: .LBB0_19: @ %for.cond.backedge.1
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_22
; CHECK-NEXT: @ %bb.20: @ %for.body.lr.ph.2
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: ldr r3, [r2, #4]
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: ite ne
; CHECK-NEXT: ldrbne r3, [r1, #1]
; CHECK-NEXT: moveq r3, #0
; CHECK-NEXT: add.w r5, r12, r3
; CHECK-NEXT: rsb.w r3, r3, #108
; CHECK-NEXT: add.w r4, r5, #19
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_22
; CHECK-NEXT: .LBB0_21: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q2, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_21
; CHECK-NEXT: .LBB0_22: @ %for.cond.backedge.2
; CHECK-NEXT: @ in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: cmp r0, #2
; CHECK-NEXT: blo .LBB0_13
; CHECK-NEXT: b .LBB0_12
; CHECK-NEXT: .LBB0_23: @ Parent Loop BB0_13 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.8 r3
; CHECK-NEXT: subs r3, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q3, [r4], #16
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB0_23
; CHECK-NEXT: b .LBB0_13
; Select one of the two loops with the unsigned test %b > 1.
entry:
%cmp = icmp ugt i8 %b, 1
br i1 %cmp, label %for.body.us.preheader, label %for.cond.preheader
; Reached when %b <= 1. %cmp43 .. %cmp43.3 each repeat the comparison made in
; %entry, and every getelementptr here addresses element 1 of %d or of %c.
for.cond.preheader: ; preds = %entry
%cmp43 = icmp ugt i8 %b, 1
%arrayidx6 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
%cmp43.1 = icmp ugt i8 %b, 1
%arrayidx6.1 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.1 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
%cmp43.2 = icmp ugt i8 %b, 1
%arrayidx6.2 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.2 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
%cmp43.3 = icmp ugt i8 %b, 1
%arrayidx6.3 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.3 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
br label %for.cond
; Reached when %b > 1. Every getelementptr here addresses element 1 of %d or
; of %c; they feed the second to fourth memset regions of the %for.body.us loop.
for.body.us.preheader: ; preds = %entry
%arrayidx6.us.1 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.us.1 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
%arrayidx6.us.2 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.us.2 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
%arrayidx6.us.3 = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 1
%arrayidx12.us.3 = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 1
br label %for.body.us
; Header of the loop used when %b <= 1: four guarded memset regions
; (%for.body.lr.ph .. %for.body.lr.ph.3) chained through the
; %for.cond.backedge* blocks, after which control returns here.
for.cond: ; preds = %for.cond.backedge.3, %for.cond.preheader
br i1 %cmp43, label %for.body.lr.ph, label %for.cond.backedge
for.body.lr.ph: ; preds = %for.cond
%0 = load i32, i32* %arrayidx6, align 4
%tobool7.not = icmp eq i32 %0, 0
br i1 %tobool7.not, label %land.end, label %land.rhs
; Header of the loop used when %b > 1. %conv44.us is 0 on the first iteration
; and 1 on every later one; it indexes %d, %c and (for the first memset only)
; the [23 x ...] dimension of @arr_183.
for.body.us: ; preds = %land.end.us.3, %for.body.us.preheader
%conv44.us = phi i32 [ 0, %for.body.us.preheader ], [ 1, %land.end.us.3 ]
%arrayidx6.us = getelementptr inbounds [3 x i32], [3 x i32]* %d, i32 0, i32 %conv44.us
%1 = load i32, i32* %arrayidx6.us, align 4
%tobool7.not.us = icmp eq i32 %1, 0
br i1 %tobool7.not.us, label %land.end.us, label %land.rhs.us
land.rhs.us: ; preds = %for.body.us
%arrayidx12.us = getelementptr inbounds [3 x i8], [3 x i8]* %c, i32 0, i32 %conv44.us
%2 = load i8, i8* %arrayidx12.us, align 1
%tobool13.us = zext i8 %2 to i32
br label %land.end.us
land.end.us: ; preds = %land.rhs.us, %for.body.us
%3 = phi i32 [ 0, %for.body.us ], [ %tobool13.us, %land.rhs.us ]
%scevgep45 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 %conv44.us, i32 %3
%4 = sub nuw nsw i32 108, %3
; The length %4 = 108 - %3 is not a compile-time constant; the other seven
; memset calls in this function have the same form.
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep45, i8 0, i32 %4, i1 false)
%5 = load i32, i32* %arrayidx6.us.1, align 4
%tobool7.not.us.1 = icmp eq i32 %5, 0
br i1 %tobool7.not.us.1, label %land.end.us.1, label %land.rhs.us.1
land.rhs: ; preds = %for.body.lr.ph
%6 = load i8, i8* %arrayidx12, align 1
%tobool13 = zext i8 %6 to i32
br label %land.end
land.end: ; preds = %land.rhs, %for.body.lr.ph
%7 = phi i32 [ 0, %for.body.lr.ph ], [ %tobool13, %land.rhs ]
%scevgep = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %7
%8 = sub nuw nsw i32 108, %7
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep, i8 0, i32 %8, i1 false)
br label %for.cond.backedge
for.cond.backedge: ; preds = %land.end, %for.cond
br i1 %cmp43.1, label %for.body.lr.ph.1, label %for.cond.backedge.1
for.body.lr.ph.1: ; preds = %for.cond.backedge
%9 = load i32, i32* %arrayidx6.1, align 4
%tobool7.not.1 = icmp eq i32 %9, 0
br i1 %tobool7.not.1, label %land.end.1, label %land.rhs.1
land.rhs.1: ; preds = %for.body.lr.ph.1
%10 = load i8, i8* %arrayidx12.1, align 1
%tobool13.1 = zext i8 %10 to i32
br label %land.end.1
land.end.1: ; preds = %land.rhs.1, %for.body.lr.ph.1
%11 = phi i32 [ 0, %for.body.lr.ph.1 ], [ %tobool13.1, %land.rhs.1 ]
%scevgep.1 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %11
%12 = sub nuw nsw i32 108, %11
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep.1, i8 0, i32 %12, i1 false)
br label %for.cond.backedge.1
for.cond.backedge.1: ; preds = %land.end.1, %for.cond.backedge
br i1 %cmp43.2, label %for.body.lr.ph.2, label %for.cond.backedge.2
for.body.lr.ph.2: ; preds = %for.cond.backedge.1
%13 = load i32, i32* %arrayidx6.2, align 4
%tobool7.not.2 = icmp eq i32 %13, 0
br i1 %tobool7.not.2, label %land.end.2, label %land.rhs.2
land.rhs.2: ; preds = %for.body.lr.ph.2
%14 = load i8, i8* %arrayidx12.2, align 1
%tobool13.2 = zext i8 %14 to i32
br label %land.end.2
land.end.2: ; preds = %land.rhs.2, %for.body.lr.ph.2
%15 = phi i32 [ 0, %for.body.lr.ph.2 ], [ %tobool13.2, %land.rhs.2 ]
%scevgep.2 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %15
%16 = sub nuw nsw i32 108, %15
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep.2, i8 0, i32 %16, i1 false)
br label %for.cond.backedge.2
for.cond.backedge.2: ; preds = %land.end.2, %for.cond.backedge.1
br i1 %cmp43.3, label %for.body.lr.ph.3, label %for.cond.backedge.3
for.body.lr.ph.3: ; preds = %for.cond.backedge.2
%17 = load i32, i32* %arrayidx6.3, align 4
%tobool7.not.3 = icmp eq i32 %17, 0
br i1 %tobool7.not.3, label %land.end.3, label %land.rhs.3
land.rhs.3: ; preds = %for.body.lr.ph.3
%18 = load i8, i8* %arrayidx12.3, align 1
%tobool13.3 = zext i8 %18 to i32
br label %land.end.3
land.end.3: ; preds = %land.rhs.3, %for.body.lr.ph.3
%19 = phi i32 [ 0, %for.body.lr.ph.3 ], [ %tobool13.3, %land.rhs.3 ]
%scevgep.3 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %19
%20 = sub nuw nsw i32 108, %19
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep.3, i8 0, i32 %20, i1 false)
br label %for.cond.backedge.3
; Latch of the %for.cond loop: unconditionally branches back to the header.
for.cond.backedge.3: ; preds = %land.end.3, %for.cond.backedge.2
br label %for.cond
land.rhs.us.1: ; preds = %land.end.us
%21 = load i8, i8* %arrayidx12.us.1, align 1
%tobool13.us.1 = zext i8 %21 to i32
br label %land.end.us.1
land.end.us.1: ; preds = %land.rhs.us.1, %land.end.us
%22 = phi i32 [ 0, %land.end.us ], [ %tobool13.us.1, %land.rhs.us.1 ]
%scevgep45.1 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %22
%23 = sub nuw nsw i32 108, %22
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep45.1, i8 0, i32 %23, i1 false)
%24 = load i32, i32* %arrayidx6.us.2, align 4
%tobool7.not.us.2 = icmp eq i32 %24, 0
br i1 %tobool7.not.us.2, label %land.end.us.2, label %land.rhs.us.2
land.rhs.us.2: ; preds = %land.end.us.1
%25 = load i8, i8* %arrayidx12.us.2, align 1
%tobool13.us.2 = zext i8 %25 to i32
br label %land.end.us.2
land.end.us.2: ; preds = %land.rhs.us.2, %land.end.us.1
%26 = phi i32 [ 0, %land.end.us.1 ], [ %tobool13.us.2, %land.rhs.us.2 ]
%scevgep45.2 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %26
%27 = sub nuw nsw i32 108, %26
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep45.2, i8 0, i32 %27, i1 false)
%28 = load i32, i32* %arrayidx6.us.3, align 4
%tobool7.not.us.3 = icmp eq i32 %28, 0
br i1 %tobool7.not.us.3, label %land.end.us.3, label %land.rhs.us.3
land.rhs.us.3: ; preds = %land.end.us.2
%29 = load i8, i8* %arrayidx12.us.3, align 1
%tobool13.us.3 = zext i8 %29 to i32
br label %land.end.us.3
; Last memset region of the %for.body.us loop; it also acts as the latch and
; branches back to the header.
land.end.us.3: ; preds = %land.rhs.us.3, %land.end.us.2
%30 = phi i32 [ 0, %land.end.us.2 ], [ %tobool13.us.3, %land.rhs.us.3 ]
%scevgep45.3 = getelementptr [20 x [23 x [19 x i8]]], [20 x [23 x [19 x i8]]]* @arr_183, i32 0, i32 0, i32 1, i32 %30
%31 = sub nuw nsw i32 108, %30
call void @llvm.memset.p0i8.i32(i8* align 1 %scevgep45.3, i8 0, i32 %31, i1 false)
br label %for.body.us
}
declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i1)

View File

@ -240,6 +240,7 @@ define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs.w lr, lr, #0
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: b .LBB10_2
; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r3
; CHECK-NEXT: subs r3, #16