; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
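
; i32 add reduction. The accumulation happens in-loop with vaddva.u32, so no
; vector accumulator needs to survive the vector body.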
define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB0_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB0_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    b .LBB0_7
; CHECK-NEXT:  .LBB0_3:
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    b .LBB0_9
; CHECK-NEXT:  .LBB0_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB0_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    le lr, .LBB0_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB0_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    le lr, .LBB0_8
; CHECK-NEXT:  .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %3 = add i32 %2, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
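
; i32 mul reduction. A <4 x i32> accumulator is multiplied in-loop (vmul.i32)
; and the four lanes are combined with scalar muls in the middle block.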
define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB1_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB1_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    b .LBB1_6
; CHECK-NEXT:  .LBB1_3: @ %vector.ph
; CHECK-NEXT:    bic r12, r1, #3
; CHECK-NEXT:    vmov.i32 q0, #0x1
; CHECK-NEXT:    sub.w r3, r12, #4
; CHECK-NEXT:    add.w lr, r2, r3, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB1_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmul.i32 q0, q1, q0
; CHECK-NEXT:    le lr, .LBB1_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    cmp r12, r1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    mul lr, r3, r2
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    mul r2, r3, r2
; CHECK-NEXT:    mul r2, r2, lr
; CHECK-NEXT:    beq .LBB1_8
; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r12
; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
; CHECK-NEXT:  .LBB1_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    muls r2, r1, r2
; CHECK-NEXT:    le lr, .LBB1_7
; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = mul <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = mul nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
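
; i32 and reduction. The accumulator starts as all-ones (vmov.i8 q0, #0xff)
; and the lanes are combined with scalar ands in the middle block.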
define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB2_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB2_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB2_7
; CHECK-NEXT:  .LBB2_3:
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    b .LBB2_9
; CHECK-NEXT:  .LBB2_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i8 q0, #0xff
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB2_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    le lr, .LBB2_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    and.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    and.w r2, r2, lr
; CHECK-NEXT:    and.w r2, r2, r12
; CHECK-NEXT:    beq .LBB2_9
; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB2_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    ands r2, r1
; CHECK-NEXT:    le lr, .LBB2_8
; CHECK-NEXT:  .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = and <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = and i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
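
; i32 or reduction. The accumulator starts at zero and the lanes are combined
; with scalar orrs in the middle block.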
define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB3_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB3_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB3_7
; CHECK-NEXT:  .LBB3_3:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB3_9
; CHECK-NEXT:  .LBB3_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB3_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vorr q0, q1, q0
; CHECK-NEXT:    le lr, .LBB3_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    orr.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    orr.w r2, r2, lr
; CHECK-NEXT:    orr.w r2, r2, r12
; CHECK-NEXT:    beq .LBB3_9
; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB3_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    orrs r2, r1
; CHECK-NEXT:    le lr, .LBB3_8
; CHECK-NEXT:  .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = or <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = or i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
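
; i32 xor reduction. The accumulator starts at zero and the lanes are combined
; with scalar eors in the middle block.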
define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB4_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB4_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB4_7
; CHECK-NEXT:  .LBB4_3:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB4_9
; CHECK-NEXT:  .LBB4_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB4_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    veor q0, q1, q0
; CHECK-NEXT:    le lr, .LBB4_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    eor.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    eor.w r2, r2, lr
; CHECK-NEXT:    eor.w r2, r2, r12
; CHECK-NEXT:    beq .LBB4_9
; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB4_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    eors r2, r1
; CHECK-NEXT:    le lr, .LBB4_8
; CHECK-NEXT:  .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = xor <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = xor i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
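
; Fast-math f32 add reduction. vadd.f32 in the loop, with the lanes summed
; pairwise in the middle block.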
define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB5_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB5_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI5_0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB5_7
; CHECK-NEXT:  .LBB5_3:
; CHECK-NEXT:    vldr s0, .LCPI5_0
; CHECK-NEXT:    b .LBB5_9
; CHECK-NEXT:  .LBB5_4: @ %vector.ph
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:  .LBB5_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
; CHECK-NEXT:    vadd.f32 q0, q1, q0
; CHECK-NEXT:    le lr, .LBB5_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vadd.f32 s4, s2, s3
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s0, s0, s4
; CHECK-NEXT:    beq .LBB5_9
; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r2
; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
; CHECK-NEXT:  .LBB5_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s2, [r0]
; CHECK-NEXT:    adds r0, #4
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    le lr, .LBB5_8
; CHECK-NEXT:  .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.10:
; CHECK-NEXT:  .LCPI5_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fadd fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fadd fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}
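
; Fast-math f32 mul reduction. vmul.f32 in the loop, with the lanes multiplied
; pairwise in the middle block.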
define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB6_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB6_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB6_7
; CHECK-NEXT:  .LBB6_3:
; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
; CHECK-NEXT:    b .LBB6_9
; CHECK-NEXT:  .LBB6_4: @ %vector.ph
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    vmov.f32 q0, #1.000000e+00
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:  .LBB6_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
; CHECK-NEXT:    vmul.f32 q0, q1, q0
; CHECK-NEXT:    le lr, .LBB6_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmul.f32 s4, s2, s3
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s0, s0, s4
; CHECK-NEXT:    beq .LBB6_9
; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r2
; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
; CHECK-NEXT:  .LBB6_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s2, [r0]
; CHECK-NEXT:    adds r0, #4
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    le lr, .LBB6_8
; CHECK-NEXT:  .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fmul fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fmul fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}
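
; i32 smin reduction. vmin.s32 in the loop, reduced to a scalar with
; vminv.s32 in the middle block.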
define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB7_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB7_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB7_7
; CHECK-NEXT:  .LBB7_3:
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:    b .LBB7_9
; CHECK-NEXT:  .LBB7_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB7_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmin.s32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB7_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vminv.s32 r2, q0
; CHECK-NEXT:    beq .LBB7_9
; CHECK-NEXT:  .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB7_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    csel r2, r2, r1, lt
; CHECK-NEXT:    le lr, .LBB7_8
; CHECK-NEXT:  .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
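
; In-loop i32 smin reduction. The smin intrinsic is applied to each loaded
; vector, so vminv.s32 appears inside the vector body.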
define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB8_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB8_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mvn r0, #-2147483648
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB8_7
; CHECK-NEXT:  .LBB8_3:
; CHECK-NEXT:    mvn r0, #-2147483648
; CHECK-NEXT:    b .LBB8_9
; CHECK-NEXT:  .LBB8_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    mvn r0, #-2147483648
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB8_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vminv.s32 r0, q0
; CHECK-NEXT:    le lr, .LBB8_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB8_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lt
; CHECK-NEXT:    le lr, .LBB8_8
; CHECK-NEXT:  .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
  %2 = icmp slt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
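
; i32 smax reduction. vmax.s32 in the loop, reduced to a scalar with
; vmaxv.s32 in the middle block.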
define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB9_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB9_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r2, #-2147483648
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB9_7
; CHECK-NEXT:  .LBB9_3:
; CHECK-NEXT:    mov.w r2, #-2147483648
; CHECK-NEXT:    b .LBB9_9
; CHECK-NEXT:  .LBB9_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i32 q0, #0x80000000
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB9_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmax.s32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB9_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    mov.w r2, #-2147483648
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vmaxv.s32 r2, q0
; CHECK-NEXT:    beq .LBB9_9
; CHECK-NEXT:  .LBB9_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB9_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    csel r2, r2, r1, gt
; CHECK-NEXT:    le lr, .LBB9_8
; CHECK-NEXT:  .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = icmp sgt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp sgt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
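
; In-loop i32 smax reduction. The smax intrinsic is applied to each loaded
; vector, so vmaxv.s32 appears inside the vector body.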
define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32_inloop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB10_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB10_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r0, #-2147483648
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB10_7
; CHECK-NEXT:  .LBB10_3:
; CHECK-NEXT:    mov.w r0, #-2147483648
; CHECK-NEXT:    b .LBB10_9
; CHECK-NEXT:  .LBB10_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    mov.w r0, #-2147483648
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB10_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vmaxv.s32 r0, q0
; CHECK-NEXT:    le lr, .LBB10_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB10_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB10_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, gt
; CHECK-NEXT:    le lr, .LBB10_8
; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
  %2 = icmp sgt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp sgt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

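; Unsigned-min reduction kept as a <4 x i32> accumulator in the loop (VMIN.U32),
; folded to a scalar with a single VMINV.U32 in the middle block.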
define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB11_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB11_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB11_7
; CHECK-NEXT:  .LBB11_3:
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    b .LBB11_9
; CHECK-NEXT:  .LBB11_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i8 q0, #0xff
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB11_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmin.u32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB11_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vminv.u32 r2, q0
; CHECK-NEXT:    beq .LBB11_9
; CHECK-NEXT:  .LBB11_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB11_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    csel r2, r2, r1, lo
; CHECK-NEXT:    le lr, .LBB11_8
; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = icmp ult <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp ult i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

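; In-loop variant of the unsigned-min reduction: llvm.vector.reduce.umin on each
; load, expected to select VMINV.U32 inside the loop.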
define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB12_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB12_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB12_7
; CHECK-NEXT:  .LBB12_3:
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    b .LBB12_9
; CHECK-NEXT:  .LBB12_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB12_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vminv.u32 r0, q0
; CHECK-NEXT:    le lr, .LBB12_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB12_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, hi
; CHECK-NEXT:    le lr, .LBB12_8
; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
  %2 = icmp ult i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp ugt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

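; Unsigned-max reduction with a vector accumulator (VMAX.U32) in the loop and an
; out-of-loop VMAXV.U32 in the middle block.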
define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB13_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB13_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB13_7
; CHECK-NEXT:  .LBB13_3:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB13_9
; CHECK-NEXT:  .LBB13_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:  .LBB13_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmax.u32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB13_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    vmaxv.u32 r2, q0
; CHECK-NEXT:    beq .LBB13_9
; CHECK-NEXT:  .LBB13_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
; CHECK-NEXT:  .LBB13_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r0], #4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    csel r2, r2, r1, hi
; CHECK-NEXT:    le lr, .LBB13_8
; CHECK-NEXT:  .LBB13_9: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp ugt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

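; In-loop unsigned-max reduction, expected to use VMAXV.U32 against a scalar
; accumulator on each iteration.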
define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32_inloop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB14_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB14_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    b .LBB14_7
; CHECK-NEXT:  .LBB14_3:
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    b .LBB14_9
; CHECK-NEXT:  .LBB14_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB14_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vmaxv.u32 r0, q0
; CHECK-NEXT:    le lr, .LBB14_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB14_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB14_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, hi
; CHECK-NEXT:    le lr, .LBB14_8
; CHECK-NEXT:  .LBB14_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
  %2 = icmp ugt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp ugt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

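; Float min reduction written as fcmp/select; the vector loop should lower to
; VCMP.F32/VPSEL, with a VMINNM.F32 tree reducing the final vector.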
define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmin_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB15_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB15_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI15_0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB15_7
; CHECK-NEXT:  .LBB15_3:
; CHECK-NEXT:    vldr s0, .LCPI15_0
; CHECK-NEXT:    b .LBB15_9
; CHECK-NEXT:  .LBB15_4: @ %vector.ph
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:  .LBB15_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
; CHECK-NEXT:    vcmp.f32 lt, q0, q1
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    le lr, .LBB15_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vminnm.f32 s4, s2, s3
; CHECK-NEXT:    vminnm.f32 s0, s0, s1
; CHECK-NEXT:    vminnm.f32 s0, s0, s4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    beq .LBB15_9
; CHECK-NEXT:  .LBB15_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r2
; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
; CHECK-NEXT:  .LBB15_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldmia r0!, {s2}
; CHECK-NEXT:    vcmp.f32 s0, s2
; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
; CHECK-NEXT:    vselge.f32 s0, s2, s0
; CHECK-NEXT:    le lr, .LBB15_8
; CHECK-NEXT:  .LBB15_9: @ %for.cond.cleanup
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.10:
; CHECK-NEXT:  .LCPI15_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fcmp ult <4 x float> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %6 = load float, float* %arrayidx, align 4
  %c = fcmp ult float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

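; Float max counterpart of fmin_f32: VCMP.F32/VPSEL in the loop and a
; VMAXNM.F32 tree in the middle block.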
define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmax_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB16_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB16_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI16_0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB16_7
; CHECK-NEXT:  .LBB16_3:
; CHECK-NEXT:    vldr s0, .LCPI16_0
; CHECK-NEXT:    b .LBB16_9
; CHECK-NEXT:  .LBB16_4: @ %vector.ph
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:  .LBB16_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
; CHECK-NEXT:    vcmp.f32 lt, q1, q0
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    le lr, .LBB16_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmaxnm.f32 s4, s2, s3
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s4
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    beq .LBB16_9
; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r2
; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
; CHECK-NEXT:  .LBB16_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldmia r0!, {s2}
; CHECK-NEXT:    vcmp.f32 s2, s0
; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
; CHECK-NEXT:    vselge.f32 s0, s2, s0
; CHECK-NEXT:    le lr, .LBB16_8
; CHECK-NEXT:  .LBB16_9: @ %for.cond.cleanup
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.10:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %6 = load float, float* %arrayidx, align 4
  %c = fcmp ugt float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

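; Tail-predicated add reduction: get.active.lane.mask with masked loads should
; become a DLSTP.32/LETP low-overhead loop using VADDVA.U32.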
define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB17_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB17_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB17_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB17_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  %4 = add i32 %3, %vec.phi
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
  ret i32 %s.0.lcssa
}

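; Tail-predicated multiply-accumulate reduction, expected to form a
; DLSTP.32/LETP loop around VMLAVA.U32.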
define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB18_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB18_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmlava.u32 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB18_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB18_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = getelementptr inbounds i32, i32* %y, i32 %index
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  %7 = add i32 %6, %vec.phi
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
  ret i32 %s.0.lcssa
}

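; Tail-predicated reduction over sign-extended i16 elements, expected to use
; DLSTP.16 with VADDVA.S16.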
define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
|
|
|
|
; CHECK-LABEL: add8i32:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r7, lr}
|
|
|
|
; CHECK-NEXT: push {r7, lr}
|
2020-08-09 17:57:17 +08:00
|
|
|
; CHECK-NEXT: cbz r1, .LBB19_4
|
2020-08-08 00:16:56 +08:00
|
|
|
; CHECK-NEXT: @ %bb.1: @ %vector.ph
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: movs r2, #0
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: dlstp.16 lr, r1
|
2020-08-08 00:16:56 +08:00
|
|
|
; CHECK-NEXT: .LBB19_2: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-08-09 17:57:17 +08:00
|
|
|
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vaddva.s16 r2, q0
|
|
|
|
; CHECK-NEXT: letp lr, .LBB19_2
|
|
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: mov r0, r2
|
|
|
|
; CHECK-NEXT: pop {r7, pc}
|
|
|
|
; CHECK-NEXT: .LBB19_4:
|
2021-01-19 01:16:07 +08:00
|
|
|
; CHECK-NEXT: movs r2, #0
|
|
|
|
; CHECK-NEXT: mov r0, r2
|
2020-08-08 00:16:56 +08:00
|
|
|
; CHECK-NEXT: pop {r7, pc}
|
|
|
|
entry:
|
|
|
|
%cmp6.not = icmp eq i32 %n, 0
|
|
|
|
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
|
|
|
|
|
|
|
|
vector.ph: ; preds = %entry
|
|
|
|
%n.rnd.up = add i32 %n, 7
|
|
|
|
%n.vec = and i32 %n.rnd.up, -8
|
|
|
|
%trip.count.minus.1 = add i32 %n, -1
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
|
2020-08-25 20:53:26 +08:00
|
|
|
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
2020-08-08 00:16:56 +08:00
|
|
|
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
|
|
|
%1 = bitcast i16* %0 to <8 x i16>*
|
|
|
|
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
|
|
|
%2 = sext <8 x i16> %wide.masked.load to <8 x i32>
|
|
|
|
%3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
|
2020-10-03 09:30:53 +08:00
|
|
|
%4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
|
2020-08-08 00:16:56 +08:00
|
|
|
%5 = add i32 %4, %vec.phi
|
|
|
|
%index.next = add i32 %index, 8
|
|
|
|
%6 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %6, label %for.cond.cleanup, label %vector.body
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
|
|
%s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
|
|
|
|
ret i32 %s.0.lcssa
|
|
|
|
}
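
; mla8i32: multiply-accumulate of two sign-extended i16 vectors into an
; i32, expected to lower to vmlava.s16 inside a dlstp.16/letp loop.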
define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB20_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB20_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlava.s16 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB20_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB20_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %vec.phi
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
  ret i32 %s.0.lcssa
}
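
; add16i32: sum of i8 loads zero-extended to i32, expected to use
; vaddva.u8 under a dlstp.8/letp tail-predicated loop.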
define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB21_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB21_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB21_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB21_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = add i32 %4, %vec.phi
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
  ret i32 %s.0.lcssa
}
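
; mla16i32: i8 multiply-accumulate widened to i32, expected to use
; vmlava.u8 under dlstp.8/letp.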
define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB22_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB22_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB22_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB22_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
  %6 = mul nuw nsw <16 x i32> %5, %2
  %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %9 = add i32 %8, %vec.phi
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
  ret i32 %s.0.lcssa
}
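
; add8i16: i16 sum accumulated at i16 width; vaddva.u16 does the work and
; the scalar result is sign-extended with sxth on return.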
define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB23_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.16 lr, r1
; CHECK-NEXT: .LBB23_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: letp lr, .LBB23_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB23_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  %4 = add i16 %3, %vec.phi
  %index.next = add i32 %index, 8
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
  ret i16 %s.0.lcssa
}
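
; mla8i16: i16 multiply-accumulate kept at i16 width; vmlava.u16 plus a
; final sxth.w of the accumulator.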
define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB24_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB24_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlava.u16 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB24_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB24_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp11.not = icmp eq i32 %n, 0
  br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = getelementptr inbounds i16, i16* %y, i32 %index
  %3 = bitcast i16* %2 to <8 x i16>*
  %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
  %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
  %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
  %7 = add i16 %6, %vec.phi
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
  ret i16 %s.0.lcssa
}
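
; add16i16: i8 loads zero-extended and summed as i16; the checks still
; expect vaddva.u8, with sxth applied to the result.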
define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB25_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB25_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB25_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB25_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
  %5 = add i16 %4, %vec.phi
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
  ret i16 %s.0.lcssa
}
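
; mla16i16: i8 multiply-accumulate summed as i16; expects vmlava.u8 with
; a final sxth.w.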
define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB26_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB26_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB26_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB26_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp13.not = icmp eq i32 %n, 0
  br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
  %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
  %9 = add i16 %8, %vec.phi
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
  ret i16 %s.0.lcssa
}
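
; add16i8: i8 sum accumulated at i8 width; vaddva.u8 with the result
; zero-extended via uxtb.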
define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB27_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB27_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB27_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB27_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp7.not = icmp eq i32 %n, 0
  br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  %4 = add i8 %3, %vec.phi
  %index.next = add i32 %index, 16
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
  ret i8 %s.0.lcssa
}
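
; mla16i8: i8 multiply-accumulate at i8 width; vmlava.u8 with a final
; uxtb.w.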
define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB28_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB28_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB28_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: uxtb.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB28_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: uxtb.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp10.not = icmp eq i32 %n, 0
  br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = getelementptr inbounds i8, i8* %y, i32 %index
  %3 = bitcast i8* %2 to <16 x i8>*
  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
  %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
  %7 = add i8 %6, %vec.phi
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
  ret i8 %s.0.lcssa
}
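
; add4i64: i32 loads sign-extended to i64; expects the 64-bit
; accumulating vaddlva.s32 under dlstp.32, returning the pair in r0/r1.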
define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB29_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB29_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddlva.s32 r2, r3, q0
; CHECK-NEXT: letp lr, .LBB29_2
; CHECK-NEXT: b .LBB29_4
; CHECK-NEXT: .LBB29_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
  %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
  %5 = add i64 %4, %vec.phi
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
  ret i64 %s.0.lcssa
}
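
; mla4i64: i32 multiply-accumulate widened to i64; expects vmlalva.s32
; under a dlstp.32/letp loop.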
define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB30_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB30_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB30_2
; CHECK-NEXT: b .LBB30_4
; CHECK-NEXT: .LBB30_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
  %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
  %9 = add i64 %8, %vec.phi
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
  ret i64 %s.0.lcssa
}
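
; mla8i64: i16 multiply-accumulate widened to i64; expects vmlalva.s16
; under a dlstp.16/letp loop.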
define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB31_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB31_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB31_2
; CHECK-NEXT: b .LBB31_4
; CHECK-NEXT: .LBB31_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
  %6 = mul nsw <8 x i64> %5, %2
  %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
  %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
  %9 = add i64 %8, %vec.phi
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
  ret i64 %s.0.lcssa
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)