From f5abf0bd485a1fa7e332f5f8266c25755d385a8a Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 15 Jan 2021 18:17:31 +0000 Subject: [PATCH] [ARM] Tail predication with constant loop bounds The TripCount for a predicated vector loop body will be ceil(ElementCount/Width). This alters the conversion of an active.lane.mask to a VCTP intrinsic to match. Differential Revision: https://reviews.llvm.org/D94608 --- llvm/lib/Target/ARM/MVETailPredication.cpp | 20 ++-- .../Thumb2/LowOverheadLoops/constbound.ll | 97 +++---------------- .../LowOverheadLoops/tp-multiple-vpst.ll | 44 ++------- 3 files changed, 32 insertions(+), 129 deletions(-) diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 8055b5cf500d..b705208660df 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -230,18 +230,16 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, } // Calculate 2 tripcount values and check that they are consistent with - // each other: - // i) The number of loop iterations extracted from the set.loop.iterations - // intrinsic, multipled by the vector width: - uint64_t TC1 = TC->getZExtValue() * VectorWidth; + // each other. The TripCount for a predicated vector loop body is + // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we + // work it out here. + uint64_t TC1 = TC->getZExtValue(); + uint64_t TC2 = + (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; - // ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start - // counting from 0. - uint64_t TC2 = ConstElemCount->getZExtValue() + 1; - - // If the tripcount values are inconsistent, we don't want to insert the - // VCTP and trigger tail-predication; it's better to keep intrinsic - // get.active.lane.mask and legalize this. 
+ // If the tripcount values are inconsistent, we can't insert the VCTP and + // trigger tail-predication; keep the intrinsic as a get.active.lane.mask + // and legalize this. if (TC1 != TC2) { LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " << TC1 << " from set.loop.iterations, and " diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 480680bee89d..d1f5a07bc4a9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -62,41 +62,17 @@ define dso_local i32 @test_501_504(i32* nocapture readonly %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adr r2, .LCPI1_0 -; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: adr r2, .LCPI1_1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: movw r1, #501 ; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vcmpt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 -; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI1_1: -; CHECK-NEXT: .long 501 @ 0x1f5 -; CHECK-NEXT: .long 501 @ 0x1f5 -; CHECK-NEXT: .long 501 @ 0x1f5 -; CHECK-NEXT: 
.long 501 @ 0x1f5 entry: br label %vector.body @@ -123,41 +99,17 @@ define dso_local i32 @test_502_504(i32* nocapture readonly %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adr r2, .LCPI2_0 -; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: adr r2, .LCPI2_1 -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov.w r1, #502 ; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vcmpt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI2_1: -; CHECK-NEXT: .long 502 @ 0x1f6 -; CHECK-NEXT: .long 502 @ 0x1f6 -; CHECK-NEXT: .long 502 @ 0x1f6 -; CHECK-NEXT: .long 502 @ 0x1f6 entry: br label %vector.body @@ -221,36 +173,17 @@ define dso_local i32 @test_504_504(i32* nocapture readonly %x) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI4_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #504 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov.w r1, #504 ; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vcmpt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 -; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI4_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll index 64c00ed03032..7777d7a6894a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll @@ -5,53 +5,25 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32 ; CHECK-LABEL: minmaxval4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: mov.w lr, #3 -; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: vmov.i32 q0, #0x80000000 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov.i32 q3, #0xa +; CHECK-NEXT: movs r2, #10 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q2, r2 -; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vcmp.u32 hi, q5, q4 -; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 hi, q3, q4 -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; 
CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.s32 gt, q4, q0 -; CHECK-NEXT: vpsel q0, q4, q0 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.s32 gt, q1, q4 -; CHECK-NEXT: vpsel q1, q4, q1 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vpt.s32 gt, q2, q0 +; CHECK-NEXT: vmovt q0, q2 +; CHECK-NEXT: vpt.s32 gt, q1, q2 +; CHECK-NEXT: vmovt q1, q2 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: vminv.s32 r0, q1 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: vmaxv.s32 r0, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: br label %vector.body