; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

%struct.DCT_InstanceTypeDef = type { float*, i32, i32 }
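; A summary of what the tests below exercise (inferred from the IR and CHECK
; lines): each DCT_mve* function computes dot products between the input
; vector %pIn and consecutive rows of the coefficient matrix held in the
; DCT instance (fields pDCTCoefs, NumFilters, NumInputs), storing one scalar
; result per row into %pOut. The inner loops use masked loads driven by
; llvm.get.active.lane.mask plus a select into the accumulator, so they are
; expected to be lowered to tail-predicated MVE low-overhead loops
; (dls/le with vctp/vpst), as the CHECK lines show.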
define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: cmp.w r12, #2
; CHECK-NEXT: blo .LBB0_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r5, [r0, #8]
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: adds r0, r5, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r4, r3, r5, lsl #2
; CHECK-NEXT: subs r3, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsl.w r9, r5, #2
; CHECK-NEXT: add.w r8, r0, r3, lsr #2
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
; CHECK-NEXT: dls lr, r8
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: mov r3, r4
; CHECK-NEXT: mov r6, r5
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r6
; CHECK-NEXT: subs r6, #4
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q1, [r7], #16
; CHECK-NEXT: vldrwt.u32 q2, [r3], #16
; CHECK-NEXT: vfmat.f32 q0, q2, q1
; CHECK-NEXT: le lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: add.w r3, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: add r4, r9
; CHECK-NEXT: cmp r0, r12
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vstr s0, [r3]
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -1
  %cmp350 = icmp ugt i32 %sub, 1
  br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.051, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi
  %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10)
  %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
  store float %12, float* %arrayidx14, align 4
  %add16 = add nuw i32 %k2.051, 1
  %exitcond52.not = icmp eq i32 %add16, %sub
  br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}
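; DCT_mve2 is the same reduction unrolled by two: each outer-loop iteration
; accumulates two coefficient rows (%mul4 and %mul5) in separate vector
; accumulators and stores two results, so the inner loop carries two
; predicated vfma chains in the generated code.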
define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #2
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB1_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r0, r12, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r7, r3, r12, lsl #3
; CHECK-NEXT: lsl.w r10, r12, #3
; CHECK-NEXT: add.w r8, r4, r0, lsr #2
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: dls lr, r8
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r4, #1
; CHECK-NEXT: mov r3, r6
; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: mov r9, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r9
; CHECK-NEXT: sub.w r9, r9, #4
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q2, [r5], #16
; CHECK-NEXT: vldrwt.u32 q3, [r3], #16
; CHECK-NEXT: vfmat.f32 q1, q3, q2
; CHECK-NEXT: vldrwt.u32 q3, [r0], #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s8, s2, s3
; CHECK-NEXT: add.w r0, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: add r6, r10
; CHECK-NEXT: vadd.f32 s2, s6, s7
; CHECK-NEXT: add r7, r10
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s2, s4, s2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: add.w r0, r2, r4, lsl #2
; CHECK-NEXT: adds r4, #2
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -2
  %cmp371 = icmp ugt i32 %sub, 1
  br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.072, %0
  %add = add nuw i32 %k2.072, 1
  %mul5 = mul i32 %add, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ]
  %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi73
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi
  %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi
  %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
  %index.next = add i32 %index, 4
  %17 = icmp eq i32 %index.next, %n.vec
  br i1 %17, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16)
  %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15)
  %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
  store float %18, float* %arrayidx21, align 4
  %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %19, float* %arrayidx23, align 4
  %add25 = add i32 %k2.072, 2
  %cmp3 = icmp ult i32 %add25, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
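; DCT_mve3 extends the pattern to three coefficient rows per outer-loop
; iteration, which needs three vector accumulators (q0-q2 in the CHECK lines)
; and a longer predicated block in the inner loop.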
define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
|
|
|
|
; CHECK-LABEL: DCT_mve3:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
|
|
; CHECK-NEXT: .pad #4
|
|
|
|
; CHECK-NEXT: sub sp, #4
|
|
|
|
; CHECK-NEXT: .vsave {d8, d9}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9}
|
|
|
|
; CHECK-NEXT: .pad #16
|
|
|
|
; CHECK-NEXT: sub sp, #16
|
|
|
|
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: ldr r1, [r0, #4]
|
|
|
|
; CHECK-NEXT: subs r1, #3
|
|
|
|
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: cmp r1, #2
|
|
|
|
; CHECK-NEXT: blo .LBB2_5
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: ldr r7, [r0, #8]
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: movs r5, #1
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: ldr r3, [r0]
|
|
|
|
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: add.w r0, r7, r7, lsl #1
|
|
|
|
; CHECK-NEXT: add.w r12, r3, r7, lsl #2
|
|
|
|
; CHECK-NEXT: add.w r1, r3, r7, lsl #3
|
|
|
|
; CHECK-NEXT: add.w r8, r3, r0, lsl #2
|
|
|
|
; CHECK-NEXT: adds r3, r7, #3
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: bic r3, r3, #3
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: lsls r7, r0, #2
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: subs r3, #4
|
|
|
|
; CHECK-NEXT: add.w r3, r5, r3, lsr #2
|
|
|
|
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: .LBB2_2: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: vmov.i32 q0, #0x0
|
|
|
|
; CHECK-NEXT: add.w r9, r5, #2
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: add.w r11, r5, #1
|
|
|
|
; CHECK-NEXT: dls lr, r0
|
|
|
|
; CHECK-NEXT: mov r3, r12
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q3, [r6], #16
; CHECK-NEXT: vldrwt.u32 q4, [r3], #16
; CHECK-NEXT: vfmat.f32 q1, q4, q3
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vfmat.f32 q2, q4, q3
; CHECK-NEXT: vldrwt.u32 q4, [r4], #16
; CHECK-NEXT: vfmat.f32 q0, q4, q3
; CHECK-NEXT: le lr, .LBB2_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s12, s10, s11
; CHECK-NEXT: add.w r0, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add r12, r7
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: add r1, r7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r8, r7
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s8, s12
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r2, r5, lsl #2
; CHECK-NEXT: adds r5, #3
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: add.w r0, r2, r9, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -3
%cmp392 = icmp ugt i32 %sub, 1
br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.093, %0
%add = add nuw i32 %k2.093, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.093, 2
%mul7 = mul i32 %add6, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ]
%vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ]
%vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi95
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi94
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi
%20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi
%21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
%22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
%index.next = add i32 %index, 4
%23 = icmp eq i32 %index.next, %n.vec
br i1 %23, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22)
%25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21)
%26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20)
%arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
store float %24, float* %arrayidx28, align 4
%arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
store float %25, float* %arrayidx30, align 4
%arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %26, float* %arrayidx32, align 4
%add34 = add i32 %k2.093, 3
%cmp3 = icmp ult i32 %add34, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: add.w r0, r3, r3, lsl #1
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r1, r3, lsl #3
; CHECK-NEXT: add.w r10, r1, r3, lsl #4
; CHECK-NEXT: add.w r9, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: lsls r7, r3, #4
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
; CHECK-NEXT: strd r0, r3, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: mov r5, r9
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: adds r0, r6, #3
; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r11
; CHECK-NEXT: sub.w r11, r11, #4
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q4, [r1], #16
; CHECK-NEXT: vldrwt.u32 q5, [r0], #16
; CHECK-NEXT: vfmat.f32 q3, q5, q4
; CHECK-NEXT: vldrwt.u32 q5, [r3], #16
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vfmat.f32 q2, q5, q4
; CHECK-NEXT: vldrwt.u32 q5, [r5], #16
; CHECK-NEXT: vfmat.f32 q1, q5, q4
; CHECK-NEXT: vldrwt.u32 q5, [r4], #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q0, q5, q4
; CHECK-NEXT: le lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s16, s14, s15
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: add r8, r7
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: add r12, r7
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: add r9, r7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r10, r7
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s12, s16
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r2, r6, lsl #2
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -4
%cmp3113 = icmp ugt i32 %sub, 1
br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0114, %0
%add = add nuw nsw i32 %k2.0114, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0114, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0114, 3
%mul9 = mul i32 %add8, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ]
%vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ]
%vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ]
%vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi116
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi117
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi115
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi
%25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi
%26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
%27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
%28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
%index.next = add i32 %index, 4
%29 = icmp eq i32 %index.next, %n.vec
br i1 %29, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28)
%31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27)
%32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26)
%33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25)
%arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
store float %31, float* %arrayidx35, align 4
%arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
store float %30, float* %arrayidx37, align 4
%arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %32, float* %arrayidx39, align 4
%arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %33, float* %arrayidx41, align 4
%add43 = add i32 %k2.0114, 4
%cmp3 = icmp ult i32 %add43, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #2
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: adds r7, r0, #1
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: ldr.w r11, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vctp.32 r11
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
; CHECK-NEXT: vldrwt.u32 q6, [r3], #16
; CHECK-NEXT: vfmat.f32 q3, q6, q5
; CHECK-NEXT: add.w r12, r9, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q6, [r9]
; CHECK-NEXT: vfmat.f32 q4, q6, q5
; CHECK-NEXT: sub.w r11, r11, #4
; CHECK-NEXT: add.w r4, r12, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q6, [r12]
; CHECK-NEXT: vfmat.f32 q2, q6, q5
; CHECK-NEXT: adds r6, r4, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q6, [r4]
; CHECK-NEXT: vfmat.f32 q0, q6, q5
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: vldrwt.u32 q6, [r6]
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: vfmat.f32 q1, q6, q5
|
|
|
|
; CHECK-NEXT: le lr, .LBB4_3
|
|
|
|
; CHECK-NEXT: @ %bb.4: @ %middle.block
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
|
|
|
|
; CHECK-NEXT: vadd.f32 s20, s18, s19
; CHECK-NEXT: add.w r1, r2, r7, lsl #2
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s16, s20
; CHECK-NEXT: vadd.f32 s12, s12, s18
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vadd.f32 s6, s8, s6
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: add.w r1, r2, r10, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -5
%cmp3134 = icmp ugt i32 %sub, 1
br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0135, %0
%add = add nuw i32 %k2.0135, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0135, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0135, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0135, 4
%mul11 = mul i32 %add10, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ]
%vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ]
%vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ]
%vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ]
%vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi137
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi139
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi138
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi136
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi
%30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi
%31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
%32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
%33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
%34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
%index.next = add i32 %index, 4
%35 = icmp eq i32 %index.next, %n.vec
br i1 %35, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34)
%37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33)
%38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32)
%39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31)
%40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30)
%arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
store float %38, float* %arrayidx42, align 4
%arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
store float %36, float* %arrayidx44, align 4
%arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %37, float* %arrayidx46, align 4
%arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %39, float* %arrayidx48, align 4
%arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %40, float* %arrayidx50, align 4
%add52 = add i32 %k2.0135, 5
%cmp3 = icmp ult i32 %add52, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

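; DCT_mve6 below follows the same pattern as the functions above, but six
; coefficient rows are accumulated per outer-loop iteration (six vector
; accumulators in the tail-predicated inner loop).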
define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: add.w r11, r0, #2
; CHECK-NEXT: adds r4, r0, #1
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: ldr.w r8, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vctp.32 r8
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q6, [r1], #16
; CHECK-NEXT: vldrwt.u32 q7, [r3], #16
; CHECK-NEXT: vfmat.f32 q4, q7, q6
; CHECK-NEXT: add.w r10, r12, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r12]
; CHECK-NEXT: vfmat.f32 q5, q7, q6
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r10]
; CHECK-NEXT: vfmat.f32 q2, q7, q6
; CHECK-NEXT: sub.w r8, r8, #4
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r6]
; CHECK-NEXT: vfmat.f32 q0, q7, q6
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q7, [r7]
; CHECK-NEXT: vfmat.f32 q3, q7, q6
; CHECK-NEXT: vldrwt.u32 q7, [r6]
; CHECK-NEXT: vfmat.f32 q1, q7, q6
; CHECK-NEXT: le lr, .LBB5_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT: vadd.f32 s24, s22, s23
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s20, s20, s21
; CHECK-NEXT: vadd.f32 s22, s18, s19
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s2, s20, s24
; CHECK-NEXT: vadd.f32 s1, s16, s22
; CHECK-NEXT: vadd.f32 s6, s12, s6
; CHECK-NEXT: vadd.f32 s4, s4, s18
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -6
%cmp3155 = icmp ugt i32 %sub, 1
br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0156, %0
%add = add nuw i32 %k2.0156, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0156, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0156, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0156, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0156, 5
%mul13 = mul i32 %add12, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ]
%vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ]
%vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ]
%vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ]
%vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ]
%vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi158
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi160
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi161
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi159
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi157
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi
%35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi
%36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
%37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
%38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
%39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
%40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
%index.next = add i32 %index, 4
%41 = icmp eq i32 %index.next, %n.vec
br i1 %41, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39)
%44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38)
%45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37)
%46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36)
%47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35)
%arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
store float %45, float* %arrayidx49, align 4
%arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
store float %43, float* %arrayidx51, align 4
%arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %42, float* %arrayidx53, align 4
%arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %44, float* %arrayidx55, align 4
%arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %46, float* %arrayidx57, align 4
%arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %47, float* %arrayidx59, align 4
%add61 = add i32 %k2.0156, 6
%cmp3 = icmp ult i32 %add61, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

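; DCT_mve7: as above with seven coefficient rows per outer-loop iteration;
; note the larger stack frame (.pad #88) reserved for spills.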
define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #88
; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r12, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: rsb r1, r3, r3, lsl #3
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: adds r4, r0, #2
; CHECK-NEXT: add.w r8, r0, #1
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q6, q2
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vctp.32 r1
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q7, [r9], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q3, q0, q7
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s26, s27
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s2, s24, s25
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s20, s10, s11
; CHECK-NEXT: vadd.f32 s11, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s6, s18, s19
; CHECK-NEXT: vadd.f32 s5, s16, s17
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s12, s12, s11
; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s20
; CHECK-NEXT: vadd.f32 s6, s5, s6
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -7
  %cmp3176 = icmp ugt i32 %sub, 1
  br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0177, %0
  %add = add nuw i32 %k2.0177, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0177, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0177, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0177, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add i32 %k2.0177, 5
  %mul13 = mul i32 %add12, %0
  %add14 = add i32 %k2.0177, 6
  %mul15 = mul i32 %add14, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
  %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
  %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
  %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
  %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
  %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
  %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi179
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi181
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi183
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi182
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi180
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi178
  %35 = add i32 %index, %mul15
  %36 = getelementptr inbounds float, float* %2, i32 %35
  %37 = bitcast float* %36 to <4 x float>*
  %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
  %39 = fadd fast <4 x float> %38, %vec.phi
  %40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
  %41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
  %42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
  %43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
  %44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
  %45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
  %46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
  %index.next = add i32 %index, 4
  %47 = icmp eq i32 %index.next, %n.vec
  br i1 %47, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
  %49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
  %50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
  %51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
  %52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
  %53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
  %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
  %arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
  store float %52, float* %arrayidx56, align 4
  %arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %50, float* %arrayidx58, align 4
  %arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %48, float* %arrayidx60, align 4
  %arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %49, float* %arrayidx62, align 4
  %arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %51, float* %arrayidx64, align 4
  %arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %53, float* %arrayidx66, align 4
  %arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
  store float %54, float* %arrayidx68, align 4
  %add70 = add i32 %k2.0177, 7
  %cmp3 = icmp ult i32 %add70, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

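; DCT_mve8: the widest variant in this file; judging by the NumFilters-minus-8 bound and the
; larger spill area below, eight rows are processed per outer-loop iteration.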
define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #104
; CHECK-NEXT: sub sp, #104
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: adds r1, r0, #7
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #1
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q0, [r12], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vfmat.f32 q6, q1, q0
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vfmat.f32 q7, q1, q0
; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov q7, q6
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpst
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: vldrwt.u32 q1, [r6]
|
|
|
|
; CHECK-NEXT: vmov q4, q2
|
|
|
|
; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: adds r7, r6, r5
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: vpst
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: vfmat.f32 q2, q1, q0
|
|
|
|
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: vpst
|
|
|
|
; CHECK-NEXT: vldrwt.u32 q1, [r7]
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
|
2020-11-02 00:24:23 +08:00
|
|
|
; CHECK-NEXT: vpst
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vmov q6, q7
; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: add r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s30, s31
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vadd.f32 s2, s28, s29
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s5, s14, s15
; CHECK-NEXT: vadd.f32 s4, s26, s27
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s14, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s13, s10, s11
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s11, s18, s19
; CHECK-NEXT: vadd.f32 s15, s16, s17
; CHECK-NEXT: vadd.f32 s2, s6, s4
; CHECK-NEXT: vadd.f32 s6, s12, s5
; CHECK-NEXT: vadd.f32 s12, s7, s14
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s13
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vadd.f32 s14, s15, s11
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #104
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
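; The reference IR below is an 8-way unrolled DCT outer loop: each for.body
; iteration computes dot products of the input against eight rows of the
; coefficient matrix, and the inner vector.body is vectorized by 4 and
; tail-predicated through llvm.get.active.lane.mask plus masked loads.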
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -8
  %cmp3197 = icmp ugt i32 %sub, 1
  br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void
for.body: ; preds = %for.body.preheader, %middle.block
  %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0198, %0
  %add = add nuw nsw i32 %k2.0198, 1
  %mul5 = mul i32 %add, %0
  %add6 = add nuw nsw i32 %k2.0198, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add nuw nsw i32 %k2.0198, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add nuw nsw i32 %k2.0198, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add nuw nsw i32 %k2.0198, 5
  %mul13 = mul i32 %add12, %0
  %add14 = add nuw nsw i32 %k2.0198, 6
  %mul15 = mul i32 %add14, %0
  %add16 = add i32 %k2.0198, 7
  %mul17 = mul i32 %add16, %0
  br label %vector.body
vector.body: ; preds = %vector.body, %for.body
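  ; One lane mask per iteration guards the nine masked loads and the final
  ; selects, so the trailing partial vector is handled without a scalar
  ; epilogue.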
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
  %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
  %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ]
  %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ]
  %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ]
  %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ]
  %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ]
  %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi200
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi202
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi204
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi205
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi203
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi201
  %35 = add i32 %index, %mul15
  %36 = getelementptr inbounds float, float* %2, i32 %35
  %37 = bitcast float* %36 to <4 x float>*
  %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
  %39 = fadd fast <4 x float> %38, %vec.phi199
  %40 = add i32 %index, %mul17
  %41 = getelementptr inbounds float, float* %2, i32 %40
  %42 = bitcast float* %41 to <4 x float>*
  %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
  %44 = fadd fast <4 x float> %43, %vec.phi
  %45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi
  %46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
  %47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
  %48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
  %49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
  %50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
  %51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
  %52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
  %index.next = add i32 %index, 4
  %53 = icmp eq i32 %index.next, %n.vec
  br i1 %53, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
  %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52)
  %55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51)
  %56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50)
  %57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49)
  %58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48)
  %59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47)
  %60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
  %61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
  %arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
  store float %59, float* %arrayidx63, align 4
  %arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %57, float* %arrayidx65, align 4
  %arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %55, float* %arrayidx67, align 4
  %arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %54, float* %arrayidx69, align 4
  %arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %56, float* %arrayidx71, align 4
  %arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %58, float* %arrayidx73, align 4
  %arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
  store float %60, float* %arrayidx75, align 4
  %arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
  store float %61, float* %arrayidx77, align 4
  %add79 = add i32 %k2.0198, 8
  %cmp3 = icmp ult i32 %add79, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)