[ARM] Tail predication with constant loop bounds

The TripCount for a predicated vector loop body will be
ceil(ElementCount/Width). This alters the conversion of a
get.active.lane.mask to a VCTP intrinsic to match.

Differential Revision: https://reviews.llvm.org/D94608
This commit is contained in:
David Green 2021-01-15 18:17:31 +00:00
parent a0770f9e4e
commit f5abf0bd48
3 changed files with 32 additions and 129 deletions

View File

@ -230,18 +230,16 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
}
// Calculate 2 tripcount values and check that they are consistent with
// each other:
// i) The number of loop iterations extracted from the set.loop.iterations
// intrinsic, multiplied by the vector width:
uint64_t TC1 = TC->getZExtValue() * VectorWidth;
// each other. The TripCount for a predicated vector loop body is
// ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we
// work it out here.
uint64_t TC1 = TC->getZExtValue();
uint64_t TC2 =
(ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth;
// ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start
// counting from 0.
uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
// If the tripcount values are inconsistent, we don't want to insert the
// VCTP and trigger tail-predication; it's better to keep intrinsic
// get.active.lane.mask and legalize this.
// If the tripcount values are inconsistent, we can't insert the VCTP and
// trigger tail-predication; keep the intrinsic as a get.active.lane.mask
// and legalize this.
if (TC1 != TC2) {
LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
<< TC1 << " from set.loop.iterations, and "

View File

@ -62,41 +62,17 @@ define dso_local i32 @test_501_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adr r2, .LCPI1_0
; CHECK-NEXT: mov.w lr, #126
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: adr r2, .LCPI1_1
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: movw r1, #501
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q0, r1
; CHECK-NEXT: vdup.32 q3, r1
; CHECK-NEXT: vcmp.u32 hi, q3, q2
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vaddvat.u32 r2, q2
; CHECK-NEXT: le lr, .LBB1_1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI1_1:
; CHECK-NEXT: .long 501 @ 0x1f5
; CHECK-NEXT: .long 501 @ 0x1f5
; CHECK-NEXT: .long 501 @ 0x1f5
; CHECK-NEXT: .long 501 @ 0x1f5
entry:
br label %vector.body
@ -123,41 +99,17 @@ define dso_local i32 @test_502_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adr r2, .LCPI2_0
; CHECK-NEXT: mov.w lr, #126
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: adr r2, .LCPI2_1
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: mov.w r1, #502
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q0, r1
; CHECK-NEXT: vdup.32 q3, r1
; CHECK-NEXT: vcmp.u32 hi, q3, q2
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vaddvat.u32 r2, q2
; CHECK-NEXT: le lr, .LBB2_1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI2_1:
; CHECK-NEXT: .long 502 @ 0x1f6
; CHECK-NEXT: .long 502 @ 0x1f6
; CHECK-NEXT: .long 502 @ 0x1f6
; CHECK-NEXT: .long 502 @ 0x1f6
entry:
br label %vector.body
@ -221,36 +173,17 @@ define dso_local i32 @test_504_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #126
; CHECK-NEXT: adr r2, .LCPI4_0
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: mov.w r2, #504
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vdup.32 q1, r2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: mov.w r1, #504
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q0, r1
; CHECK-NEXT: vdup.32 q3, r1
; CHECK-NEXT: vcmp.u32 hi, q3, q2
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vaddvat.u32 r2, q2
; CHECK-NEXT: le lr, .LBB4_1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
br label %vector.body

View File

@ -5,53 +5,25 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32
; CHECK-LABEL: minmaxval4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: mov.w lr, #3
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmov.i32 q3, #0xa
; CHECK-NEXT: movs r2, #10
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q4, q2, r2
; CHECK-NEXT: vdup.32 q5, r2
; CHECK-NEXT: vcmp.u32 hi, q5, q4
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.u32 hi, q3, q4
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.s32 gt, q4, q0
; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.s32 gt, q1, q4
; CHECK-NEXT: vpsel q1, q4, q1
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
; CHECK-NEXT: vpt.s32 gt, q2, q0
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vpt.s32 gt, q1, q2
; CHECK-NEXT: vmovt q1, q2
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: vminv.s32 r0, q1
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: vmaxv.s32 r0, q0
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
br label %vector.body