llvm-project/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
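
; add_i32: in-loop i32 add reduction over x[0..n); the vector body accumulates
; directly into a GPR with vaddva.u32, and a scalar tail loop handles the remainder.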
define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB0_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
%3 = add i32 %2, %vec.phi
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
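
; mul_i32: out-of-loop i32 multiply reduction; vmul.i32 keeps a vector of partial
; products and the lanes are multiplied together in the middle block.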
define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: bic r12, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: sub.w r3, r12, #4
; CHECK-NEXT: add.w lr, r2, r3, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: cmp r12, r1
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: mul lr, r3, r2
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: mul r2, r2, lr
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = mul <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = mul nsw i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
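
; and_i32: out-of-loop i32 AND reduction; vand in the loop, with the final value
; combined lane-by-lane in the middle block.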
define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB2_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB2_9
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB2_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: and.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: and.w r2, r2, lr
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB2_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: ands r2, r1
; CHECK-NEXT: le lr, .LBB2_8
; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = and <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = and i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
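
; or_i32: out-of-loop i32 OR reduction; vorr in the loop, combined lane-by-lane
; in the middle block.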
define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB3_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_9
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB3_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: orr.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orr.w r2, r2, lr
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB3_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: orrs r2, r1
; CHECK-NEXT: le lr, .LBB3_8
; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = or <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = or i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
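
; xor_i32: out-of-loop i32 XOR reduction; veor in the loop, combined lane-by-lane
; in the middle block.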
define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB4_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB4_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_9
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB4_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: eor.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eor.w r2, r2, lr
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB4_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: eors r2, r1
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = xor <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = xor i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
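
; fadd_f32: fast-math float add reduction starting from 0.0; vadd.f32 in the loop
; and pairwise vadd.f32 of the lanes in the middle block.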
define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB5_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fadd fast <4 x float> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%5 = load float, float* %arrayidx, align 4
%add = fadd fast float %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
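
; fmul_f32: fast-math float multiply reduction starting from 1.0; vmul.f32 in the
; loop with the lanes multiplied together in the middle block.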
define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB6_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB6_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: b .LBB6_9
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB6_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fmul fast <4 x float> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%5 = load float, float* %arrayidx, align 4
%add = fmul fast float %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
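
; smin_i32: out-of-loop signed-minimum reduction; vmin.s32 in the loop and a
; single vminv.s32 in the middle block.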
define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB7_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB7_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmvn.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.s32 r2, q0
; CHECK-NEXT: beq .LBB7_9
; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB7_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lt
; CHECK-NEXT: le lr, .LBB7_8
; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = icmp slt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp slt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
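
; smin_i32_inloop: in-loop signed-minimum reduction; vminv.s32 updates the scalar
; minimum on every vector iteration.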
define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB8_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB8_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: b .LBB8_9
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB8_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vminv.s32 r0, q0
; CHECK-NEXT: le lr, .LBB8_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB8_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
%2 = icmp slt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp slt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
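
; smax_i32: out-of-loop signed-maximum reduction; vmax.s32 in the loop and
; vmaxv.s32 in the middle block.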
define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.s32 r2, q0
; CHECK-NEXT: beq .LBB9_9
; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB9_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, gt
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = icmp sgt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp sgt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
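
; smax_i32_inloop: in-loop signed-maximum reduction; vmaxv.s32 updates the scalar
; maximum on every vector iteration.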
define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB10_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vmaxv.s32 r0, q0
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB10_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, gt
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
%2 = icmp sgt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp sgt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
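
; umin_i32: out-of-loop unsigned-minimum reduction; vmin.u32 in the loop and
; vminv.u32 in the middle block.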
define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB11_7
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.u32 r2, q0
; CHECK-NEXT: beq .LBB11_9
; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB11_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lo
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = icmp ult <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp ult i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
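
; umin_i32_inloop: in-loop unsigned-minimum reduction; vminv.u32 updates the
; scalar minimum on every vector iteration.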
define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB12_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB12_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: b .LBB12_9
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB12_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vminv.u32 r0, q0
; CHECK-NEXT: le lr, .LBB12_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB12_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB12_8
; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
%2 = icmp ult i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp ugt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
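; Out-of-loop unsigned-max reduction: the vector phi stays in a q register (vmax.u32 per
; iteration) with a single vmaxv.u32 in the middle block.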
define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB13_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB13_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB13_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB13_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.u32 r2, q0
; CHECK-NEXT: beq .LBB13_9
; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB13_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, hi
; CHECK-NEXT: le lr, .LBB13_8
; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = icmp ugt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp ugt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
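; In-loop unsigned-max reduction, using vmaxv.u32 on each vector block inside the loop.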
define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB14_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB14_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB14_7
; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB14_9
; CHECK-NEXT: .LBB14_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB14_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vmaxv.u32 r0, q0
; CHECK-NEXT: le lr, .LBB14_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB14_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB14_8
; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
%2 = icmp ugt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp ugt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
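; Float min reduction built from fcmp/select with llvm.vector.reduce.fmin in the middle
; block; lowered to a vcmp.f32/vpsel loop followed by a vminnm.f32 reduction tree.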
define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmin_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB15_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB15_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: b .LBB15_9
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB15_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB15_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vminnm.f32 s4, s2, s3
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: vminnm.f32 s0, s0, s4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB15_9
; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB15_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vcmp.f32 s0, s2
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fcmp ult <4 x float> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%6 = load float, float* %arrayidx, align 4
%c = fcmp ult float %r.07, %6
%add = select i1 %c, float %r.07, float %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
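; Float max reduction, the fmax analogue of the test above, reduced with a vmaxnm.f32
; tree after the vector loop.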
define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmax_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB16_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB16_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB16_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB16_9
; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vcmp.f32 s2, s0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI16_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fcmp ugt <4 x float> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%6 = load float, float* %arrayidx, align 4
%c = fcmp ugt float %r.07, %6
%add = select i1 %c, float %r.07, float %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
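; Tail-predicated i32 add reduction using get.active.lane.mask and masked loads,
; forming a dlstp.32/letp loop around vaddva.u32.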
define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB17_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB17_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: letp lr, .LBB17_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB17_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
%3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
%4 = add i32 %3, %vec.phi
%index.next = add i32 %index, 4
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
ret i32 %s.0.lcssa
}
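; Tail-predicated i32 multiply-accumulate reduction, using vmlava.u32 in a dlstp.32/letp loop.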
define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB18_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB18_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB18_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB18_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp8.not = icmp eq i32 %n, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %y, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
%5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
%6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
%7 = add i32 %6, %vec.phi
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
ret i32 %s.0.lcssa
}
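; Tail-predicated reduction of sign-extended i16 loads into an i32 accumulator, using vaddva.s16.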
define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB19_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.16 lr, r1
; CHECK-NEXT: .LBB19_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: letp lr, .LBB19_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB19_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i16, i16* %x, i32 %index
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = sext <8 x i16> %wide.masked.load to <8 x i32>
%3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
%4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%5 = add i32 %4, %vec.phi
%index.next = add i32 %index, 8
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
ret i32 %s.0.lcssa
}
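; Tail-predicated multiply-accumulate of sign-extended i16 loads into an i32 accumulator,
; using vmlava.s16.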
define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB20_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB20_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlava.s16 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB20_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB20_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i16, i16* %x, i32 %index
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = sext <8 x i16> %wide.masked.load to <8 x i32>
%3 = getelementptr inbounds i16, i16* %y, i32 %index
%4 = bitcast i16* %3 to <8 x i16>*
%wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
%6 = mul nsw <8 x i32> %5, %2
%7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
%8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
%9 = add i32 %8, %vec.phi
%index.next = add i32 %index, 8
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
ret i32 %s.0.lcssa
}
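; Tail-predicated reduction of zero-extended i8 loads into an i32 accumulator, using vaddva.u8.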
define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB21_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB21_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB21_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB21_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i32>
%3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%5 = add i32 %4, %vec.phi
%index.next = add i32 %index, 16
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
ret i32 %s.0.lcssa
}
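; Tail-predicated multiply-accumulate of zero-extended i8 loads into an i32 accumulator,
; using vmlava.u8.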
define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB22_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB22_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB22_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB22_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i32>
%3 = getelementptr inbounds i8, i8* %y, i32 %index
%4 = bitcast i8* %3 to <16 x i8>*
%wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
%6 = mul nuw nsw <16 x i32> %5, %2
%7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
%9 = add i32 %8, %vec.phi
%index.next = add i32 %index, 16
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
ret i32 %s.0.lcssa
}
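; Tail-predicated i16 add reduction with an i16 accumulator; the result is sign-extended
; on return, so the loop uses vaddva.u16 followed by sxth.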
define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB23_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.16 lr, r1
; CHECK-NEXT: .LBB23_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: letp lr, .LBB23_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB23_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp8.not = icmp eq i32 %n, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i16, i16* %x, i32 %index
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
%3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
%4 = add i16 %3, %vec.phi
%index.next = add i32 %index, 8
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
ret i16 %s.0.lcssa
}
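; Multiply-accumulate of two <8 x i16> inputs into an i16 accumulator; expected to use vmlava.u16 in a dlstp.16/letp loop.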
define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB24_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB24_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlava.u16 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB24_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB24_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp11.not = icmp eq i32 %n, 0
br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i16, i16* %x, i32 %index
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = getelementptr inbounds i16, i16* %y, i32 %index
%3 = bitcast i16* %2 to <8 x i16>*
%wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
%5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
%6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
%7 = add i16 %6, %vec.phi
%index.next = add i32 %index, 8
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
ret i16 %s.0.lcssa
}
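; <16 x i8> loads zero-extended to i16 and add-reduced into an i16 accumulator; expected to fold to vaddva.u8 in a dlstp.8/letp loop.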
define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB25_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB25_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB25_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB25_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp8.not = icmp eq i32 %n, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i16>
%3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
%4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
%5 = add i16 %4, %vec.phi
%index.next = add i32 %index, 16
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
ret i16 %s.0.lcssa
}
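; i8 inputs zero-extended to i16, multiplied and add-reduced into an i16 accumulator; expected to fold to vmlava.u8 in a dlstp.8/letp loop.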
define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB26_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB26_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB26_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB26_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: sxth.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp13.not = icmp eq i32 %n, 0
br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i16>
%3 = getelementptr inbounds i8, i8* %y, i32 %index
%4 = bitcast i8* %3 to <16 x i8>*
%wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
%6 = mul nuw <16 x i16> %5, %2
%7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
%8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
%9 = add i16 %8, %vec.phi
%index.next = add i32 %index, 16
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
ret i16 %s.0.lcssa
}
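; Plain <16 x i8> add reduction with an i8 accumulator; expected to use vaddva.u8 in a dlstp.8/letp loop.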
define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB27_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.8 lr, r1
; CHECK-NEXT: .LBB27_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: letp lr, .LBB27_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB27_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp7.not = icmp eq i32 %n, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
%3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
%4 = add i8 %3, %vec.phi
%index.next = add i32 %index, 16
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
ret i8 %s.0.lcssa
}
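; <16 x i8> multiply-accumulate with an i8 accumulator; expected to use vmlava.u8 in a dlstp.8/letp loop.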
define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB28_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r2
; CHECK-NEXT: .LBB28_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: vmlava.u8 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB28_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: uxtb.w r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB28_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: uxtb.w r0, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp10.not = icmp eq i32 %n, 0
br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i8, i8* %x, i32 %index
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = getelementptr inbounds i8, i8* %y, i32 %index
%3 = bitcast i8* %2 to <16 x i8>*
%wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
%5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
%6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
%7 = add i8 %6, %vec.phi
%index.next = add i32 %index, 16
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
ret i8 %s.0.lcssa
}
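; <4 x i32> loads sign-extended to i64 and add-reduced into an i64 accumulator; expected to use vaddlva.s32 in a dlstp.32/letp loop.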
define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB29_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB29_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddlva.s32 r2, r3, q0
; CHECK-NEXT: letp lr, .LBB29_2
; CHECK-NEXT: b .LBB29_4
; CHECK-NEXT: .LBB29_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = sext <4 x i32> %wide.masked.load to <4 x i64>
%3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
%4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
%5 = add i64 %4, %vec.phi
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
ret i64 %s.0.lcssa
}
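; i32 inputs sign-extended to i64, multiplied and add-reduced into an i64 accumulator; expected to use vmlalva.s32 in a dlstp.32/letp loop.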
define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB30_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB30_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB30_2
; CHECK-NEXT: b .LBB30_4
; CHECK-NEXT: .LBB30_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = sext <4 x i32> %wide.masked.load to <4 x i64>
%3 = getelementptr inbounds i32, i32* %y, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
%wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
%6 = mul nsw <4 x i64> %5, %2
%7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
%8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
%9 = add i64 %8, %vec.phi
%index.next = add i32 %index, 4
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
ret i64 %s.0.lcssa
}
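; i16 inputs sign-extended to i64, multiplied and add-reduced into an i64 accumulator; expected to use vmlalva.s16 in a dlstp.16/letp loop.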
define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB31_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB31_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB31_2
; CHECK-NEXT: b .LBB31_4
; CHECK-NEXT: .LBB31_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
%0 = getelementptr inbounds i16, i16* %x, i32 %index
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = sext <8 x i16> %wide.masked.load to <8 x i64>
%3 = getelementptr inbounds i16, i16* %y, i32 %index
%4 = bitcast i16* %3 to <8 x i16>*
%wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
%6 = mul nsw <8 x i64> %5, %2
%7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
%8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
%9 = add i64 %8, %vec.phi
%index.next = add i32 %index, 8
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
ret i64 %s.0.lcssa
}
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)