forked from OSchip/llvm-project
[ARM] Tail predication with constant loop bounds
The TripCount for a predicated vector loop body will be ceil(ElementCount/Width). This alters the conversion of an active.lane.mask to a VCTP intrinsic to match. Differential Revision: https://reviews.llvm.org/D94608
This commit is contained in:
parent
a0770f9e4e
commit
f5abf0bd48
|
@ -230,18 +230,16 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
|||
}
|
||||
|
||||
// Calculate 2 tripcount values and check that they are consistent with
|
||||
// each other:
|
||||
// i) The number of loop iterations extracted from the set.loop.iterations
|
||||
// intrinsic, multiplied by the vector width:
|
||||
uint64_t TC1 = TC->getZExtValue() * VectorWidth;
|
||||
// each other. The TripCount for a predicated vector loop body is
|
||||
// ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we
|
||||
// work it out here.
|
||||
uint64_t TC1 = TC->getZExtValue();
|
||||
uint64_t TC2 =
|
||||
(ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth;
|
||||
|
||||
// ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start
|
||||
// counting from 0.
|
||||
uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
|
||||
|
||||
// If the tripcount values are inconsistent, we don't want to insert the
|
||||
// VCTP and trigger tail-predication; it's better to keep intrinsic
|
||||
// get.active.lane.mask and legalize this.
|
||||
// If the tripcount values are inconsistent, we can't insert the VCTP and
|
||||
// trigger tail-predication; keep the intrinsic as a get.active.lane.mask
|
||||
// and legalize this.
|
||||
if (TC1 != TC2) {
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
|
||||
<< TC1 << " from set.loop.iterations, and "
|
||||
|
|
|
@ -62,41 +62,17 @@ define dso_local i32 @test_501_504(i32* nocapture readonly %x) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adr r2, .LCPI1_0
|
||||
; CHECK-NEXT: mov.w lr, #126
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: adr r2, .LCPI1_1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r2]
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: movs r1, #0
|
||||
; CHECK-NEXT: movw r1, #501
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: dlstp.32 lr, r1
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vadd.i32 q2, q0, r1
|
||||
; CHECK-NEXT: vdup.32 q3, r1
|
||||
; CHECK-NEXT: vcmp.u32 hi, q3, q2
|
||||
; CHECK-NEXT: adds r1, #4
|
||||
; CHECK-NEXT: vpnot
|
||||
; CHECK-NEXT: vpsttt
|
||||
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
|
||||
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
|
||||
; CHECK-NEXT: vaddvat.u32 r2, q2
|
||||
; CHECK-NEXT: le lr, .LBB1_1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
||||
; CHECK-NEXT: vaddva.u32 r2, q0
|
||||
; CHECK-NEXT: letp lr, .LBB1_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: mov r0, r2
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI1_0:
|
||||
; CHECK-NEXT: .long 0 @ 0x0
|
||||
; CHECK-NEXT: .long 1 @ 0x1
|
||||
; CHECK-NEXT: .long 2 @ 0x2
|
||||
; CHECK-NEXT: .long 3 @ 0x3
|
||||
; CHECK-NEXT: .LCPI1_1:
|
||||
; CHECK-NEXT: .long 501 @ 0x1f5
|
||||
; CHECK-NEXT: .long 501 @ 0x1f5
|
||||
; CHECK-NEXT: .long 501 @ 0x1f5
|
||||
; CHECK-NEXT: .long 501 @ 0x1f5
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
@ -123,41 +99,17 @@ define dso_local i32 @test_502_504(i32* nocapture readonly %x) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adr r2, .LCPI2_0
|
||||
; CHECK-NEXT: mov.w lr, #126
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: adr r2, .LCPI2_1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r2]
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: movs r1, #0
|
||||
; CHECK-NEXT: mov.w r1, #502
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: dlstp.32 lr, r1
|
||||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vadd.i32 q2, q0, r1
|
||||
; CHECK-NEXT: vdup.32 q3, r1
|
||||
; CHECK-NEXT: vcmp.u32 hi, q3, q2
|
||||
; CHECK-NEXT: adds r1, #4
|
||||
; CHECK-NEXT: vpnot
|
||||
; CHECK-NEXT: vpsttt
|
||||
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
|
||||
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
|
||||
; CHECK-NEXT: vaddvat.u32 r2, q2
|
||||
; CHECK-NEXT: le lr, .LBB2_1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
||||
; CHECK-NEXT: vaddva.u32 r2, q0
|
||||
; CHECK-NEXT: letp lr, .LBB2_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: mov r0, r2
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI2_0:
|
||||
; CHECK-NEXT: .long 0 @ 0x0
|
||||
; CHECK-NEXT: .long 1 @ 0x1
|
||||
; CHECK-NEXT: .long 2 @ 0x2
|
||||
; CHECK-NEXT: .long 3 @ 0x3
|
||||
; CHECK-NEXT: .LCPI2_1:
|
||||
; CHECK-NEXT: .long 502 @ 0x1f6
|
||||
; CHECK-NEXT: .long 502 @ 0x1f6
|
||||
; CHECK-NEXT: .long 502 @ 0x1f6
|
||||
; CHECK-NEXT: .long 502 @ 0x1f6
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
@ -221,36 +173,17 @@ define dso_local i32 @test_504_504(i32* nocapture readonly %x) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: mov.w lr, #126
|
||||
; CHECK-NEXT: adr r2, .LCPI4_0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: mov.w r2, #504
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: vdup.32 q1, r2
|
||||
; CHECK-NEXT: movs r1, #0
|
||||
; CHECK-NEXT: mov.w r1, #504
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: dlstp.32 lr, r1
|
||||
; CHECK-NEXT: .LBB4_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vadd.i32 q2, q0, r1
|
||||
; CHECK-NEXT: vdup.32 q3, r1
|
||||
; CHECK-NEXT: vcmp.u32 hi, q3, q2
|
||||
; CHECK-NEXT: adds r1, #4
|
||||
; CHECK-NEXT: vpnot
|
||||
; CHECK-NEXT: vpsttt
|
||||
; CHECK-NEXT: vcmpt.u32 hi, q1, q2
|
||||
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
|
||||
; CHECK-NEXT: vaddvat.u32 r2, q2
|
||||
; CHECK-NEXT: le lr, .LBB4_1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
||||
; CHECK-NEXT: vaddva.u32 r2, q0
|
||||
; CHECK-NEXT: letp lr, .LBB4_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: mov r0, r2
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI4_0:
|
||||
; CHECK-NEXT: .long 0 @ 0x0
|
||||
; CHECK-NEXT: .long 1 @ 0x1
|
||||
; CHECK-NEXT: .long 2 @ 0x2
|
||||
; CHECK-NEXT: .long 3 @ 0x3
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
|
|
@ -5,53 +5,25 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32
|
|||
; CHECK-LABEL: minmaxval4:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: sub sp, #8
|
||||
; CHECK-NEXT: mov.w lr, #3
|
||||
; CHECK-NEXT: adr r3, .LCPI0_0
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r3]
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x80000000
|
||||
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: vmov.i32 q3, #0xa
|
||||
; CHECK-NEXT: movs r2, #10
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vadd.i32 q4, q2, r2
|
||||
; CHECK-NEXT: vdup.32 q5, r2
|
||||
; CHECK-NEXT: vcmp.u32 hi, q5, q4
|
||||
; CHECK-NEXT: adds r2, #4
|
||||
; CHECK-NEXT: vpnot
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vcmpt.u32 hi, q3, q4
|
||||
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
|
||||
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vcmpt.s32 gt, q4, q0
|
||||
; CHECK-NEXT: vpsel q0, q4, q0
|
||||
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vcmpt.s32 gt, q1, q4
|
||||
; CHECK-NEXT: vpsel q1, q4, q1
|
||||
; CHECK-NEXT: le lr, .LBB0_1
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
|
||||
; CHECK-NEXT: vpt.s32 gt, q2, q0
|
||||
; CHECK-NEXT: vmovt q0, q2
|
||||
; CHECK-NEXT: vpt.s32 gt, q1, q2
|
||||
; CHECK-NEXT: vmovt q1, q2
|
||||
; CHECK-NEXT: letp lr, .LBB0_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %middle.block
|
||||
; CHECK-NEXT: mvn r0, #-2147483648
|
||||
; CHECK-NEXT: vminv.s32 r0, q1
|
||||
; CHECK-NEXT: str r0, [r1]
|
||||
; CHECK-NEXT: mov.w r0, #-2147483648
|
||||
; CHECK-NEXT: vmaxv.s32 r0, q0
|
||||
; CHECK-NEXT: add sp, #8
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI0_0:
|
||||
; CHECK-NEXT: .long 0 @ 0x0
|
||||
; CHECK-NEXT: .long 1 @ 0x1
|
||||
; CHECK-NEXT: .long 2 @ 0x2
|
||||
; CHECK-NEXT: .long 3 @ 0x3
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
|
Loading…
Reference in New Issue