llvm-project/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
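
; fmas1: fma(x, y, splat(a)) via llvm.fma. The splatted addend is expected to
; select the vfmas.f32 form (scalar addend in a GPR) inside a dlstp/letp
; tail-predicated loop.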
define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
%6 = getelementptr inbounds float, float* %z, i32 %index
%7 = bitcast float* %6 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
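
; fmas2: the same fma(x, y, splat(a)) computation as fmas1, written as
; fmul+fadd instead of llvm.fma; still expected to form vfmas.f32.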
define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
%6 = fadd fast <4 x float> %5, %broadcast.splat14
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
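
; fma1: fma(x, splat(a), y) via llvm.fma. The splatted multiplicand should
; select vfma.f32 with the scalar in a GPR.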
define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
%6 = getelementptr inbounds float, float* %z, i32 %index
%7 = bitcast float* %6 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
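
; fma2: fma(x, splat(a), y) written as fmul+fadd; still expected to form
; vfma.f32 with the scalar operand.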
define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
%4 = getelementptr inbounds float, float* %y, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
%6 = fadd fast <4 x float> %3, %wide.masked.load14
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
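
; fmss1: fma(x, y, splat(-a)). The scalar fneg is expected to fold into an
; eor of the sign bit on the GPR value, keeping the vfmas.f32 form.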
define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%fneg = fneg fast float %a
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
%6 = getelementptr inbounds float, float* %z, i32 %index
%7 = bitcast float* %6 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
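
; fmss2: (x*y) - splat(a) via fmul+fsub. This currently negates the splat in a
; q register outside the loop and copies it each iteration with vmov, rather
; than using the eor+vfmas form seen in fmss1.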
define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: vmov lr, s0
; CHECK-NEXT: vdup.32 q0, lr
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfma.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
%6 = fsub fast <4 x float> %5, %broadcast.splat14
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
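
; fmss3: fma(x, -y, splat(a)). The negated multiplicand folds into vfms.f32,
; with the splat accumulator kept in a q register.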
define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fneg fast <4 x float> %wide.masked.load12
%6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
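
; fmss4: splat(a) - (x*y) via fmul+fsub; also expected to lower to vfms.f32
; with the splat in a q register.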
define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB7_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
%6 = fsub fast <4 x float> %broadcast.splat14, %5
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
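
; fms1: fma(x, splat(-a), y). As in fmss1, the scalar negation becomes a
; sign-bit eor, here feeding vfma.f32 with a GPR operand.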
define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%fneg = fneg fast float %a
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
%6 = getelementptr inbounds float, float* %z, i32 %index
%7 = bitcast float* %6 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
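
; fms2: y - x*splat(a) via fmul+fsub; lowers to the vector vfms.f32 form with
; the splat in a q register.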
define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB9_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [r2], #16
; CHECK-NEXT: letp lr, .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
%6 = fsub fast <4 x float> %wide.masked.load12, %5
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
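
; fms3: fma(x, splat(a), -y). The fneg of the loaded vector stays as an
; in-loop vneg.f32 followed by vfma.f32 with the scalar in a GPR.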
define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vneg.f32 q1, q1
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
%5 = fneg fast <4 x float> %wide.masked.load12
%6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5)
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
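
; fms4: x*splat(a) - y via fmul+fsub; expected to produce the same in-loop
; vneg.f32 + vfma.f32 sequence as fms3.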
define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB11_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vneg.f32 q1, q1
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
%4 = getelementptr inbounds float, float* %y, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
%6 = fsub fast <4 x float> %3, %wide.masked.load14
%7 = getelementptr inbounds float, float* %z, i32 %index
%8 = bitcast float* %7 to <4 x float>*
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)