; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_s_q31:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: bne .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: mov r11, r2
; CHECK-NEXT: b .LBB0_6
; CHECK-NEXT: .LBB0_3: @ %vector.ph
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bic r3, r3, #1
; CHECK-NEXT: subs r7, r3, #2
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: adr r4, .LCPI0_0
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w lr, r6, r7, lsr #1
; CHECK-NEXT: add.w r11, r2, r3, lsl #2
; CHECK-NEXT: add.w r10, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrd r5, r4, [r0]
; CHECK-NEXT: mov.w r3, #-1
; CHECK-NEXT: ldrd r8, r7, [r1]
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: smull r4, r7, r7, r4
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: smull r6, r5, r8, r5
; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
; CHECK-NEXT: sbcs r3, r7
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: asrl r6, r5, #31
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r9, ne
; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
; CHECK-NEXT: mov.w r3, #-1
; CHECK-NEXT: vmov q4[2], q4[0], r6, r4
; CHECK-NEXT: sbcs r3, r5
; CHECK-NEXT: vmov q4[3], q4[1], r5, r7
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: mvn r6, #-2147483648
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q2[2], q2[0], r3, r9
; CHECK-NEXT: vmov q2[3], q2[1], r3, r9
; CHECK-NEXT: vbic q3, q0, q2
; CHECK-NEXT: vand q2, q4, q2
; CHECK-NEXT: vorr q2, q2, q3
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vmov r3, s9
; CHECK-NEXT: vmov r5, s10
; CHECK-NEXT: subs r4, r4, r6
; CHECK-NEXT: vmov r4, s11
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: subs r5, r5, r6
; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: sbcs r4, r4, #0
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: csetm r4, ne
; CHECK-NEXT: vmov q3[2], q3[0], r3, r4
; CHECK-NEXT: vbic q4, q1, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vorr q2, q2, q4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: strd r4, r3, [r2]
; CHECK-NEXT: adds r2, #8
; CHECK-NEXT: le lr, .LBB0_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: beq .LBB0_8
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader
; CHECK-NEXT: sub.w lr, r3, r7
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov.w r1, #-2147483648
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: .LBB0_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r3, [r12], #4
; CHECK-NEXT: ldr r4, [r10], #4
; CHECK-NEXT: smull r4, r3, r4, r3
; CHECK-NEXT: asrl r4, r3, #31
; CHECK-NEXT: subs r5, r1, r4
; CHECK-NEXT: sbcs.w r5, r0, r3
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csel r4, r4, r1, ne
; CHECK-NEXT: csel r3, r3, r0, ne
; CHECK-NEXT: subs r5, r4, r2
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: csel r3, r4, r2, lt
; CHECK-NEXT: str r3, [r11], #4
; CHECK-NEXT: le lr, .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.9:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 2147483648 @ 0x80000000
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
; CHECK-NEXT: .long 2147483648 @ 0x80000000
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
entry:
  switch i32 %N, label %vector.ph [
    i32 0, label %for.cond.cleanup
    i32 1, label %for.body.preheader
  ]

vector.ph: ; preds = %entry
  %n.vec = and i32 %N, -2
  %ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
  %0 = bitcast i32* %next.gep to <2 x i32>*
  %wide.load = load <2 x i32>, <2 x i32>* %0, align 4
  %1 = sext <2 x i32> %wide.load to <2 x i64>
  %2 = bitcast i32* %next.gep18 to <2 x i32>*
  %wide.load20 = load <2 x i32>, <2 x i32>* %2, align 4
  %3 = sext <2 x i32> %wide.load20 to <2 x i64>
  %4 = mul nsw <2 x i64> %3, %1
  %5 = ashr <2 x i64> %4, <i64 31, i64 31>
  %6 = icmp sgt <2 x i64> %5, <i64 -2147483648, i64 -2147483648>
  %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 -2147483648, i64 -2147483648>
  %8 = icmp slt <2 x i64> %7, <i64 2147483647, i64 2147483647>
  %9 = select <2 x i1> %8, <2 x i64> %7, <2 x i64> <i64 2147483647, i64 2147483647>
  %10 = trunc <2 x i64> %9 to <2 x i32>
  %11 = bitcast i32* %next.gep19 to <2 x i32>*
  store <2 x i32> %10, <2 x i32>* %11, align 4
  %index.next = add i32 %index, 2
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry, %middle.block
  %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi i32* [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi i32* [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi i32* [ %pDst, %entry ], [ %ind.end17, %middle.block ]
  br label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
  %pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
  %pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
  %pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
  %13 = load i32, i32* %pSrcA.addr.011, align 4
  %conv = sext i32 %13 to i64
  %incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
  %14 = load i32, i32* %pSrcB.addr.010, align 4
  %conv2 = sext i32 %14 to i64
  %mul = mul nsw i64 %conv2, %conv
  %shr = ashr i64 %mul, 31
  %15 = icmp sgt i64 %shr, -2147483648
  %.val.i = select i1 %15, i64 %shr, i64 -2147483648
  %16 = icmp slt i64 %.val.i, 2147483647
  %retval.0.i = select i1 %16, i64 %.val.i, i64 2147483647
  %conv3 = trunc i64 %retval.0.i to i32
  %incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
  store i32 %conv3, i32* %pDst.addr.09, align 4
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_4_q31:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: mov r11, r2
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: subs r7, r3, #4
; CHECK-NEXT: adr r4, .LCPI1_0
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI1_1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w r11, r2, r3, lsl #2
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q3, [r1], #16
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: vmov.f32 s20, s14
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vmov.f32 s22, s15
; CHECK-NEXT: vmullb.s32 q6, q5, q4
; CHECK-NEXT: vmov.f32 s14, s13
; CHECK-NEXT: vmov r7, s27
; CHECK-NEXT: vmov r4, s26
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: vmov r10, s24
; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
; CHECK-NEXT: vmov.f32 s10, s9
; CHECK-NEXT: sbcs.w r5, r2, r7
; CHECK-NEXT: vmov r6, s12
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: vmov r5, s25
; CHECK-NEXT: csetm r8, ne
; CHECK-NEXT: asrl r10, r5, #31
; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
; CHECK-NEXT: vmov q6[2], q6[0], r10, r4
; CHECK-NEXT: sbcs.w r3, r2, r5
; CHECK-NEXT: vmov q6[3], q6[1], r5, r7
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q4[2], q4[0], r3, r8
; CHECK-NEXT: vmov q4[3], q4[1], r3, r8
; CHECK-NEXT: mvn r8, #-2147483648
; CHECK-NEXT: vbic q5, q0, q4
; CHECK-NEXT: vand q4, q6, q4
; CHECK-NEXT: vorr q4, q4, q5
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: vmov r3, s17
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: subs.w r4, r4, r8
; CHECK-NEXT: vmov r4, s19
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: subs.w r5, r5, r8
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: sbcs r4, r4, #0
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: csetm r4, ne
; CHECK-NEXT: vmov q5[2], q5[0], r3, r4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r4, s14
; CHECK-NEXT: smull r6, r5, r6, r5
; CHECK-NEXT: vbic q6, q1, q5
; CHECK-NEXT: vand q4, q4, q5
; CHECK-NEXT: vorr q4, q4, q6
; CHECK-NEXT: asrl r6, r5, #31
|
|
|
; CHECK-NEXT: smull r4, r7, r4, r3
|
|
|
|
; CHECK-NEXT: asrl r4, r7, #31
|
|
|
|
; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
|
|
|
|
; CHECK-NEXT: vmov q5[2], q5[0], r6, r4
|
|
|
|
; CHECK-NEXT: sbcs.w r3, r2, r7
|
|
|
|
; CHECK-NEXT: vmov q5[3], q5[1], r5, r7
|
2020-12-10 20:14:23 +08:00
|
|
|
; CHECK-NEXT: mov.w r3, #0
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: it lt
|
2020-12-10 20:14:23 +08:00
|
|
|
; CHECK-NEXT: movlt r3, #1
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: csetm r3, ne
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: rsbs.w r1, r6, #-2147483648
; CHECK-NEXT: sbcs.w r1, r2, r5
; CHECK-NEXT: mov.w r1, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q2[2], q2[0], r1, r3
; CHECK-NEXT: vmov q2[3], q2[1], r1, r3
; CHECK-NEXT: ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT: vbic q3, q0, q2
; CHECK-NEXT: vand q2, q5, q2
; CHECK-NEXT: vorr q2, q2, q3
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r4, s9
; CHECK-NEXT: subs.w r3, r3, r8
; CHECK-NEXT: sbcs r3, r4, #0
; CHECK-NEXT: vmov r4, s10
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: vmov r3, s11
; CHECK-NEXT: csetm r5, ne
; CHECK-NEXT: vmov.32 q3[1], r5
; CHECK-NEXT: subs.w r4, r4, r8
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q3[2], q3[0], r5, r3
; CHECK-NEXT: vbic q5, q1, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vorr q2, q2, q5
; CHECK-NEXT: vmov.f32 s9, s10
; CHECK-NEXT: vmov.f32 s10, s16
; CHECK-NEXT: vmov.f32 s11, s18
; CHECK-NEXT: vstrb.8 q2, [r2], #16
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21
; CHECK-NEXT: sub.w lr, r3, r7
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov.w r3, #-2147483648
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r12], #4
; CHECK-NEXT: ldr r4, [r9], #4
; CHECK-NEXT: smull r4, r1, r4, r1
; CHECK-NEXT: asrl r4, r1, #31
; CHECK-NEXT: subs r5, r3, r4
; CHECK-NEXT: sbcs.w r5, r0, r1
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csel r4, r4, r3, ne
; CHECK-NEXT: csel r1, r1, r0, ne
; CHECK-NEXT: subs r5, r4, r2
; CHECK-NEXT: sbcs r1, r1, #0
; CHECK-NEXT: csel r1, r4, r2, lt
; CHECK-NEXT: str r1, [r11], #4
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.9:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 2147483648 @ 0x80000000
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
; CHECK-NEXT: .long 2147483648 @ 0x80000000
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
; CHECK-NEXT: .LCPI1_1:
; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
; CHECK-NEXT: .long 0 @ 0x0
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21: ; preds = %middle.block, %for.body.preheader
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%pSrcA.addr.011.ph = phi i32* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
%pSrcB.addr.010.ph = phi i32* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
%pDst.addr.09.ph = phi i32* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
br label %for.body

vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
%ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
%ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
%ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%next.gep = getelementptr i32, i32* %pSrcA, i32 %index
%next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
%next.gep19 = getelementptr i32, i32* %pDst, i32 %index
%0 = bitcast i32* %next.gep to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %0, align 4
%1 = sext <4 x i32> %wide.load to <4 x i64>
%2 = bitcast i32* %next.gep18 to <4 x i32>*
%wide.load20 = load <4 x i32>, <4 x i32>* %2, align 4
%3 = sext <4 x i32> %wide.load20 to <4 x i64>
%4 = mul nsw <4 x i64> %3, %1
%5 = ashr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
%6 = icmp sgt <4 x i64> %5, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
%7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
%8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%9 = select <4 x i1> %8, <4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%10 = trunc <4 x i64> %9 to <4 x i32>
%11 = bitcast i32* %next.gep19 to <4 x i32>*
store <4 x i32> %10, <4 x i32>* %11, align 4
%index.next = add i32 %index, 4
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader21, %for.body
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
%pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
%pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
%pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
%incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
%13 = load i32, i32* %pSrcA.addr.011, align 4
%conv = sext i32 %13 to i64
%incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
%14 = load i32, i32* %pSrcB.addr.010, align 4
%conv2 = sext i32 %14 to i64
%mul = mul nsw i64 %conv2, %conv
%shr = ashr i64 %mul, 31
%15 = icmp sgt i64 %shr, -2147483648
%.val.i = select i1 %15, i64 %shr, i64 -2147483648
%16 = icmp slt i64 %.val.i, 2147483647
%retval.0.i = select i1 %16, i64 %.val.i, i64 2147483647
%conv3 = trunc i64 %retval.0.i to i32
%incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
store i32 %conv3, i32* %pDst.addr.09, align 4
%inc = add nuw i32 %i.012, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_4t_q31:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r7, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: bic r7, r7, #3
; CHECK-NEXT: adr r4, .LCPI2_1
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: adr r5, .LCPI2_2
; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vldrw.u32 q3, [r5]
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: adr r6, .LCPI2_0
; CHECK-NEXT: subs r7, r3, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: vdup.32 q1, r7
; CHECK-NEXT: mov.w r12, #-1
; CHECK-NEXT: mvn r8, #-2147483648
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vdup.32 q4, r9
; CHECK-NEXT: add.w r9, r9, #4
; CHECK-NEXT: vorr q4, q4, q0
; CHECK-NEXT: vpt.u32 cs, q1, q4
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
; CHECK-NEXT: vmov.f32 s24, s18
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
; CHECK-NEXT: vmov.f32 s28, s22
; CHECK-NEXT: vmov.f32 s26, s19
; CHECK-NEXT: vmov.f32 s30, s23
; CHECK-NEXT: vmullb.s32 q0, q7, q6
; CHECK-NEXT: vmov.f32 s18, s17
; CHECK-NEXT: vmov r5, s3
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: asrl r6, r5, #31
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
; CHECK-NEXT: vmov.f32 s22, s21
; CHECK-NEXT: sbcs.w r7, r12, r5
; CHECK-NEXT: mov.w r7, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: vmov r7, s1
; CHECK-NEXT: csetm r10, ne
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
; CHECK-NEXT: vmov q7[2], q7[0], r4, r6
; CHECK-NEXT: sbcs.w r3, r12, r7
; CHECK-NEXT: vmov q7[3], q7[1], r7, r5
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: vmov r7, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r10
; CHECK-NEXT: vmov q0[3], q0[1], r3, r10
; CHECK-NEXT: vbic q6, q2, q0
; CHECK-NEXT: vand q0, q7, q0
; CHECK-NEXT: vorr q6, q0, q6
; CHECK-NEXT: vmov r4, s24
; CHECK-NEXT: vmov r3, s25
; CHECK-NEXT: vmov r5, s26
; CHECK-NEXT: subs.w r4, r4, r8
; CHECK-NEXT: vmov r4, s27
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: subs.w r5, r5, r8
; CHECK-NEXT: vmov.32 q0[1], r3
; CHECK-NEXT: sbcs r4, r4, #0
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: csetm r4, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r4
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov r4, s22
; CHECK-NEXT: vbic q7, q3, q0
; CHECK-NEXT: vand q0, q6, q0
; CHECK-NEXT: vorr q6, q0, q7
; CHECK-NEXT: smull r6, r5, r4, r3
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: asrl r6, r5, #31
; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
; CHECK-NEXT: sbcs.w r3, r12, r5
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r10, ne
; CHECK-NEXT: smull r4, r7, r7, r4
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
; CHECK-NEXT: vmov q5[2], q5[0], r4, r6
; CHECK-NEXT: sbcs.w r3, r12, r7
; CHECK-NEXT: vmov q5[3], q5[1], r7, r5
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r10
; CHECK-NEXT: vmov q0[3], q0[1], r3, r10
; CHECK-NEXT: vbic q4, q2, q0
; CHECK-NEXT: vand q0, q5, q0
; CHECK-NEXT: vorr q4, q0, q4
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: vmov r3, s17
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: subs.w r4, r4, r8
; CHECK-NEXT: vmov r4, s19
|
|
|
; CHECK-NEXT: sbcs r3, r3, #0
|
|
|
|
; CHECK-NEXT: mov.w r3, #0
|
|
|
|
; CHECK-NEXT: it lt
|
|
|
|
; CHECK-NEXT: movlt r3, #1
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: csetm r3, ne
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: subs.w r5, r5, r8
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: vmov.32 q0[1], r3
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: sbcs r4, r4, #0
|
|
|
|
; CHECK-NEXT: mov.w r4, #0
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: it lt
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: movlt r4, #1
|
|
|
|
; CHECK-NEXT: cmp r4, #0
|
|
|
|
; CHECK-NEXT: csetm r4, ne
|
|
|
|
; CHECK-NEXT: vmov q0[2], q0[0], r3, r4
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: vbic q5, q3, q0
|
|
|
|
; CHECK-NEXT: vand q0, q4, q0
|
|
|
|
; CHECK-NEXT: vorr q0, q0, q5
|
|
|
|
; CHECK-NEXT: vmov.f32 s1, s2
|
|
|
|
; CHECK-NEXT: vmov.f32 s2, s24
|
|
|
|
; CHECK-NEXT: vmov.f32 s3, s26
|
|
|
|
; CHECK-NEXT: vpst
|
|
|
|
; CHECK-NEXT: vstrwt.32 q0, [r2], #16
|
|
|
|
; CHECK-NEXT: le lr, .LBB2_2
|
|
|
|
; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: add sp, #16
|
|
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .p2align 4
|
|
|
|
; CHECK-NEXT: @ %bb.4:
|
|
|
|
; CHECK-NEXT: .LCPI2_0:
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
; CHECK-NEXT: .long 1 @ 0x1
|
|
|
|
; CHECK-NEXT: .long 2 @ 0x2
|
|
|
|
; CHECK-NEXT: .long 3 @ 0x3
|
|
|
|
; CHECK-NEXT: .LCPI2_1:
|
|
|
|
; CHECK-NEXT: .long 2147483648 @ 0x80000000
|
|
|
|
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
|
|
|
|
; CHECK-NEXT: .long 2147483648 @ 0x80000000
|
|
|
|
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
|
|
|
|
; CHECK-NEXT: .LCPI2_2:
|
|
|
|
; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
entry:
|
|
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
|
|
|
|
|
|
|
|
vector.ph: ; preds = %entry
|
|
|
|
%n.rnd.up = add i32 %N, 3
|
|
|
|
%n.vec = and i32 %n.rnd.up, -4
|
|
|
|
%trip.count.minus.1 = add i32 %N, -1
|
|
|
|
%broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
|
|
|
|
%broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
|
|
|
|
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
|
|
|
|
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
|
|
|
%next.gep = getelementptr i32, i32* %pSrcA, i32 %index
|
|
|
|
%next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
|
|
|
|
%next.gep19 = getelementptr i32, i32* %pDst, i32 %index
|
|
|
|
%0 = icmp ule <4 x i32> %induction, %broadcast.splat21
|
|
|
|
%1 = bitcast i32* %next.gep to <4 x i32>*
|
|
|
|
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> undef)
|
|
|
|
%2 = sext <4 x i32> %wide.masked.load to <4 x i64>
|
|
|
|
%3 = bitcast i32* %next.gep18 to <4 x i32>*
|
|
|
|
%wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> undef)
|
|
|
|
%4 = sext <4 x i32> %wide.masked.load22 to <4 x i64>
|
|
|
|
%5 = mul nsw <4 x i64> %4, %2
|
|
|
|
%6 = ashr <4 x i64> %5, <i64 31, i64 31, i64 31, i64 31>
|
|
|
|
%7 = icmp sgt <4 x i64> %6, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
|
|
|
|
%8 = select <4 x i1> %7, <4 x i64> %6, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
|
|
|
|
%9 = icmp slt <4 x i64> %8, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
|
|
|
|
%10 = select <4 x i1> %9, <4 x i64> %8, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
|
|
|
|
%11 = trunc <4 x i64> %10 to <4 x i32>
|
|
|
|
%12 = bitcast i32* %next.gep19 to <4 x i32>*
|
|
|
|
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %11, <4 x i32>* %12, i32 4, <4 x i1> %0)
|
|
|
|
%index.next = add i32 %index, 4
|
|
|
|
%13 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %13, label %for.cond.cleanup, label %vector.body
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
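; usatmul_2_q31: unsigned Q31 saturating multiply. Each pair of i32 inputs is
; zero-extended to i64, multiplied, shifted right by 31 and clamped to
; 0xffffffff, two lanes per vector iteration, with a scalar tail loop for any
; remainder.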
define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_2_q31:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB3_8
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: bne .LBB3_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r11, r1
; CHECK-NEXT: mov r8, r2
; CHECK-NEXT: b .LBB3_6
; CHECK-NEXT: .LBB3_3: @ %vector.ph
; CHECK-NEXT: bic r5, r3, #1
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: subs r7, r5, #2
; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w r8, r2, r5, lsl #2
; CHECK-NEXT: add.w r11, r1, r5, lsl #2
; CHECK-NEXT: add.w lr, r6, r7, lsr #1
; CHECK-NEXT: add.w r12, r0, r5, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrd r4, r7, [r0]
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: ldrd r5, r10, [r1]
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: umull r4, r5, r5, r4
; CHECK-NEXT: lsrl r4, r5, #31
; CHECK-NEXT: subs.w r6, r4, #-1
; CHECK-NEXT: umull r6, r7, r10, r7
; CHECK-NEXT: sbcs r5, r5, #0
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: lsrl r6, r7, #31
; CHECK-NEXT: csetm r9, ne
; CHECK-NEXT: subs.w r5, r6, #-1
; CHECK-NEXT: vmov.32 q0[1], r9
; CHECK-NEXT: sbcs r5, r7, #0
; CHECK-NEXT: vmov q1[2], q1[0], r4, r6
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r5, ne
; CHECK-NEXT: vmov q0[2], q0[0], r9, r5
; CHECK-NEXT: vand q1, q1, q0
; CHECK-NEXT: vorn q0, q1, q0
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: strd r5, r4, [r2]
; CHECK-NEXT: adds r2, #8
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: beq .LBB3_8
; CHECK-NEXT: .LBB3_6: @ %for.body.preheader
; CHECK-NEXT: sub.w lr, r3, r7
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r0, [r12], #4
; CHECK-NEXT: ldr r1, [r11], #4
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: lsrl r0, r1, #31
; CHECK-NEXT: subs.w r2, r0, #-1
; CHECK-NEXT: sbcs r1, r1, #0
; CHECK-NEXT: it hs
; CHECK-NEXT: movhs.w r0, #-1
; CHECK-NEXT: str r0, [r8], #4
; CHECK-NEXT: le lr, .LBB3_7
; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
switch i32 %N, label %vector.ph [
i32 0, label %for.cond.cleanup
i32 1, label %for.body.preheader
]

vector.ph: ; preds = %entry
%n.vec = and i32 %N, -2
%ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
%ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
%ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%next.gep = getelementptr i32, i32* %pSrcA, i32 %index
%next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
%next.gep19 = getelementptr i32, i32* %pDst, i32 %index
%0 = bitcast i32* %next.gep to <2 x i32>*
%wide.load = load <2 x i32>, <2 x i32>* %0, align 4
%1 = zext <2 x i32> %wide.load to <2 x i64>
%2 = bitcast i32* %next.gep18 to <2 x i32>*
%wide.load20 = load <2 x i32>, <2 x i32>* %2, align 4
%3 = zext <2 x i32> %wide.load20 to <2 x i64>
%4 = mul nuw <2 x i64> %3, %1
%5 = lshr <2 x i64> %4, <i64 31, i64 31>
%6 = icmp ult <2 x i64> %5, <i64 4294967295, i64 4294967295>
%7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 4294967295, i64 4294967295>
%8 = trunc <2 x i64> %7 to <2 x i32>
%9 = bitcast i32* %next.gep19 to <2 x i32>*
store <2 x i32> %8, <2 x i32>* %9, align 4
%index.next = add i32 %index, 2
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry, %middle.block
%i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
%pSrcA.addr.011.ph = phi i32* [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
%pSrcB.addr.010.ph = phi i32* [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
%pDst.addr.09.ph = phi i32* [ %pDst, %entry ], [ %ind.end17, %middle.block ]
br label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
%pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
%pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
%pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
%incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
%11 = load i32, i32* %pSrcA.addr.011, align 4
%conv = zext i32 %11 to i64
%incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
%12 = load i32, i32* %pSrcB.addr.010, align 4
%conv2 = zext i32 %12 to i64
%mul = mul nuw i64 %conv2, %conv
%shr = lshr i64 %mul, 31
%13 = icmp ult i64 %shr, 4294967295
%retval.0.i = select i1 %13, i64 %shr, i64 4294967295
%conv3 = trunc i64 %retval.0.i to i32
%incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
store i32 %conv3, i32* %pDst.addr.09, align 4
%inc = add nuw i32 %i.012, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

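; usatmul_4_q31: the same unsigned Q31 saturating multiply as usatmul_2_q31,
; but vectorized four lanes per iteration; the vector loop handles multiples
; of four and the scalar loop the remainder.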
define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_4_q31:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB4_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB4_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: mov r10, r2
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_3: @ %vector.ph
; CHECK-NEXT: bic r8, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r8, #4
; CHECK-NEXT: add.w r10, r2, r8, lsl #2
; CHECK-NEXT: add.w r9, r1, r8, lsl #2
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: add.w r12, r0, r8, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s12, s10
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov.f32 s14, s11
; CHECK-NEXT: vmullb.u32 q4, q3, q1
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: vmov r5, s17
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: lsrl r4, r5, #31
; CHECK-NEXT: vmov r7, s19
; CHECK-NEXT: subs.w r6, r4, #-1
; CHECK-NEXT: vmov.f32 s10, s9
; CHECK-NEXT: sbcs r5, r5, #0
; CHECK-NEXT: vmov r6, s18
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: lsrl r6, r7, #31
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r11, ne
; CHECK-NEXT: subs.w r5, r6, #-1
; CHECK-NEXT: sbcs r5, r7, #0
; CHECK-NEXT: vmov.32 q1[1], r11
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: vmov q3[2], q3[0], r4, r6
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r5, ne
; CHECK-NEXT: vmov q1[2], q1[0], r11, r5
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vorn q1, q3, q1
; CHECK-NEXT: vmullb.u32 q3, q2, q0
; CHECK-NEXT: vmov r5, s13
; CHECK-NEXT: vmov r4, s12
; CHECK-NEXT: lsrl r4, r5, #31
; CHECK-NEXT: vmov r7, s15
; CHECK-NEXT: subs.w r6, r4, #-1
; CHECK-NEXT: sbcs r5, r5, #0
; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: lsrl r6, r7, #31
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r11, ne
; CHECK-NEXT: subs.w r5, r6, #-1
; CHECK-NEXT: sbcs r5, r7, #0
; CHECK-NEXT: vmov.32 q0[1], r11
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: vmov q2[2], q2[0], r4, r6
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r5, ne
; CHECK-NEXT: vmov q0[2], q0[0], r11, r5
; CHECK-NEXT: vand q2, q2, q0
; CHECK-NEXT: vorn q0, q2, q0
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s6
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB4_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: beq .LBB4_8
; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21
; CHECK-NEXT: sub.w lr, r3, r8
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r0, [r12], #4
; CHECK-NEXT: ldr r1, [r9], #4
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: lsrl r0, r1, #31
; CHECK-NEXT: subs.w r2, r0, #-1
; CHECK-NEXT: sbcs r1, r1, #0
; CHECK-NEXT: it hs
; CHECK-NEXT: movhs.w r0, #-1
; CHECK-NEXT: str r0, [r10], #4
; CHECK-NEXT: le lr, .LBB4_7
; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21: ; preds = %middle.block, %for.body.preheader
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%pSrcA.addr.011.ph = phi i32* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
%pSrcB.addr.010.ph = phi i32* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
%pDst.addr.09.ph = phi i32* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
br label %for.body

vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
%ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
%ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
%ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%next.gep = getelementptr i32, i32* %pSrcA, i32 %index
%next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
%next.gep19 = getelementptr i32, i32* %pDst, i32 %index
%0 = bitcast i32* %next.gep to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %0, align 4
%1 = zext <4 x i32> %wide.load to <4 x i64>
%2 = bitcast i32* %next.gep18 to <4 x i32>*
%wide.load20 = load <4 x i32>, <4 x i32>* %2, align 4
%3 = zext <4 x i32> %wide.load20 to <4 x i64>
%4 = mul nuw <4 x i64> %3, %1
%5 = lshr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
%6 = icmp ult <4 x i64> %5, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%8 = trunc <4 x i64> %7 to <4 x i32>
%9 = bitcast i32* %next.gep19 to <4 x i32>*
store <4 x i32> %8, <4 x i32>* %9, align 4
%index.next = add i32 %index, 4
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader21, %for.body
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
%pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
%pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
%pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
%incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
%11 = load i32, i32* %pSrcA.addr.011, align 4
%conv = zext i32 %11 to i64
%incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
%12 = load i32, i32* %pSrcB.addr.010, align 4
%conv2 = zext i32 %12 to i64
%mul = mul nuw i64 %conv2, %conv
%shr = lshr i64 %mul, 31
%13 = icmp ult i64 %shr, 4294967295
%retval.0.i = select i1 %13, i64 %shr, i64 4294967295
%conv3 = trunc i64 %retval.0.i to i32
%incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
store i32 %conv3, i32* %pDst.addr.09, align 4
%inc = add nuw i32 %i.012, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; i16

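; ssatmul_4_q15: signed Q15 saturating multiply on i16 data. Inputs are
; sign-extended to i32, multiplied, arithmetically shifted right by 15 and
; clamped to [-32768, 32767], four lanes per vector iteration.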
define arm_aapcs_vfpcc void @ssatmul_4_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_4_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r3, .LBB5_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB5_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r4, r2
; CHECK-NEXT: b .LBB5_6
; CHECK-NEXT: .LBB5_3: @ %vector.ph
; CHECK-NEXT: bic r5, r3, #3
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: subs r6, r5, #4
; CHECK-NEXT: add.w r12, r0, r5, lsl #1
; CHECK-NEXT: add.w lr, r4, r6, lsr #2
; CHECK-NEXT: add.w r4, r2, r5, lsl #1
; CHECK-NEXT: add.w r6, r1, r5, lsl #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.s32 q0, [r0], #8
; CHECK-NEXT: vldrh.s32 q1, [r1], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
; CHECK-NEXT: vstrh.32 q0, [r2], #8
; CHECK-NEXT: le lr, .LBB5_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21
; CHECK-NEXT: sub.w lr, r3, r5
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r0, [r12], #2
; CHECK-NEXT: ldrsh r1, [r6], #2
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: ssat r0, #16, r0, asr #15
; CHECK-NEXT: strh r0, [r4], #2
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21: ; preds = %middle.block, %for.body.preheader
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
%pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
%pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
br label %for.body

vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
%ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
%ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
%ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
%next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
%next.gep19 = getelementptr i16, i16* %pDst, i32 %index
%0 = bitcast i16* %next.gep to <4 x i16>*
%wide.load = load <4 x i16>, <4 x i16>* %0, align 2
%1 = sext <4 x i16> %wide.load to <4 x i32>
%2 = bitcast i16* %next.gep18 to <4 x i16>*
%wide.load20 = load <4 x i16>, <4 x i16>* %2, align 2
%3 = sext <4 x i16> %wide.load20 to <4 x i32>
%4 = mul nsw <4 x i32> %3, %1
%5 = ashr <4 x i32> %4, <i32 15, i32 15, i32 15, i32 15>
%6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%8 = icmp slt <4 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767>
%9 = select <4 x i1> %8, <4 x i32> %7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
%10 = trunc <4 x i32> %9 to <4 x i16>
%11 = bitcast i16* %next.gep19 to <4 x i16>*
store <4 x i16> %10, <4 x i16>* %11, align 2
%index.next = add i32 %index, 4
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader21, %for.body
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
%pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
%pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
%pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
%incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
%13 = load i16, i16* %pSrcA.addr.011, align 2
%conv = sext i16 %13 to i32
%incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
%14 = load i16, i16* %pSrcB.addr.010, align 2
%conv2 = sext i16 %14 to i32
%mul = mul nsw i32 %conv2, %conv
%shr = ashr i32 %mul, 15
%15 = icmp sgt i32 %shr, -32768
%.val.i = select i1 %15, i32 %shr, i32 -32768
%16 = icmp slt i32 %.val.i, 32767
%retval.0.i = select i1 %16, i32 %.val.i, i32 32767
%conv3 = trunc i32 %retval.0.i to i16
%incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
store i16 %conv3, i16* %pDst.addr.09, align 2
%inc = add nuw i32 %i.012, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #7
; CHECK-NEXT: bhi .LBB6_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r4, r2
; CHECK-NEXT: b .LBB6_6
; CHECK-NEXT: .LBB6_3: @ %vector.ph
; CHECK-NEXT: bic r5, r3, #7
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: sub.w r6, r5, #8
; CHECK-NEXT: add.w r12, r0, r5, lsl #1
; CHECK-NEXT: add.w lr, r4, r6, lsr #3
; CHECK-NEXT: add.w r4, r2, r5, lsl #1
; CHECK-NEXT: add.w r6, r1, r5, lsl #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vldrh.s32 q1, [r1], #16
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
; CHECK-NEXT: vstrh.32 q0, [r2, #8]
; CHECK-NEXT: vldrh.s32 q0, [r0], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
; CHECK-NEXT: vstrh.32 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB6_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21
; CHECK-NEXT: sub.w lr, r3, r5
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r0, [r12], #2
; CHECK-NEXT: ldrsh r1, [r6], #2
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: ssat r0, #16, r0, asr #15
; CHECK-NEXT: strh r0, [r4], #2
; CHECK-NEXT: le lr, .LBB6_7
; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 8
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21: ; preds = %middle.block, %for.body.preheader
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
  br label %for.body

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %N, -8
  %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
  %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
  %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
  %0 = bitcast i16* %next.gep to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
  %1 = sext <8 x i16> %wide.load to <8 x i32>
  %2 = bitcast i16* %next.gep18 to <8 x i16>*
  %wide.load20 = load <8 x i16>, <8 x i16>* %2, align 2
  %3 = sext <8 x i16> %wide.load20 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = ashr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %9 = select <8 x i1> %8, <8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = bitcast i16* %next.gep19 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader21, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
  %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
  %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
  %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
  %13 = load i16, i16* %pSrcA.addr.011, align 2
  %conv = sext i16 %13 to i32
  %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
  %14 = load i16, i16* %pSrcB.addr.010, align 2
  %conv2 = sext i16 %14 to i32
  %mul = mul nsw i32 %conv2, %conv
  %shr = ashr i32 %mul, 15
  %15 = icmp sgt i32 %shr, -32768
  %.val.i = select i1 %15, i32 %shr, i32 -32768
  %16 = icmp slt i32 %.val.i, 32767
  %retval.0.i = select i1 %16, i32 %.val.i, i32 32767
  %conv3 = trunc i32 %retval.0.i to i16
  %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
  store i16 %conv3, i16* %pDst.addr.09, align 2
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define arm_aapcs_vfpcc void @ssatmul_8i_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8i_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r3, .LBB7_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #7
; CHECK-NEXT: bhi .LBB7_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r4, r2
; CHECK-NEXT: b .LBB7_6
; CHECK-NEXT: .LBB7_3: @ %vector.ph
; CHECK-NEXT: bic r5, r3, #7
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: sub.w r6, r5, #8
; CHECK-NEXT: add.w r12, r0, r5, lsl #1
; CHECK-NEXT: add.w lr, r4, r6, lsr #3
; CHECK-NEXT: add.w r4, r2, r5, lsl #1
; CHECK-NEXT: add.w r6, r1, r5, lsl #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmullt.s16 q2, q1, q0
; CHECK-NEXT: vmullb.s16 q0, q1, q0
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
; CHECK-NEXT: vqshrnt.s32 q0, q2, #15
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB7_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21
; CHECK-NEXT: sub.w lr, r3, r5
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r0, [r12], #2
; CHECK-NEXT: ldrsh r1, [r6], #2
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: ssat r0, #16, r0, asr #15
; CHECK-NEXT: strh r0, [r4], #2
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 8
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21: ; preds = %middle.block, %for.body.preheader
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
  br label %for.body

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %N, -8
  %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
  %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
  %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
  %0 = bitcast i16* %next.gep to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
  %1 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %2 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %3 = sext <4 x i16> %1 to <4 x i32>
  %4 = sext <4 x i16> %2 to <4 x i32>
  %5 = bitcast i16* %next.gep18 to <8 x i16>*
  %wide.load20 = load <8 x i16>, <8 x i16>* %5, align 2
  %6 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %7 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %8 = sext <4 x i16> %6 to <4 x i32>
  %9 = sext <4 x i16> %7 to <4 x i32>
  %10 = mul <4 x i32> %8, %3
  %11 = mul <4 x i32> %9, %4
  %12 = ashr <4 x i32> %10, <i32 15, i32 15, i32 15, i32 15>
  %13 = ashr <4 x i32> %11, <i32 15, i32 15, i32 15, i32 15>
  %14 = icmp sgt <4 x i32> %12, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %15 = icmp sgt <4 x i32> %13, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %16 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %17 = select <4 x i1> %15, <4 x i32> %13, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %18 = icmp slt <4 x i32> %16, <i32 32767, i32 32767, i32 32767, i32 32767>
  %19 = icmp slt <4 x i32> %17, <i32 32767, i32 32767, i32 32767, i32 32767>
  %20 = select <4 x i1> %18, <4 x i32> %16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %21 = select <4 x i1> %19, <4 x i32> %17, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %22 = shufflevector <4 x i32> %20, <4 x i32> %21, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %23 = trunc <8 x i32> %22 to <8 x i16>
  %24 = bitcast i16* %next.gep19 to <8 x i16>*
  store <8 x i16> %23, <8 x i16>* %24, align 2
  %index.next = add i32 %index, 8
  %25 = icmp eq i32 %index.next, %n.vec
  br i1 %25, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader21
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
  %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
  %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
  %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
  %26 = load i16, i16* %pSrcA.addr.011, align 2
  %conv = sext i16 %26 to i32
  %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
  %27 = load i16, i16* %pSrcB.addr.010, align 2
  %conv2 = sext i16 %27 to i32
  %mul = mul nsw i32 %conv2, %conv
  %shr = ashr i32 %mul, 15
  %28 = icmp sgt i32 %shr, -32768
  %.val.i = select i1 %28, i32 %shr, i32 -32768
  %29 = icmp slt i32 %.val.i, 32767
  %retval.0.i = select i1 %29, i32 %.val.i, i32 32767
  %conv3 = trunc i32 %retval.0.i to i16
  %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
  store i16 %conv3, i16* %pDst.addr.09, align 2
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_s4t_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: add.w r12, r3, #3
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: adr r4, .LCPI8_0
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vdup.32 q2, r3
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vorr q2, q2, q0
; CHECK-NEXT: vptt.u32 cs, q1, q2
; CHECK-NEXT: vldrht.s32 q2, [r0], #8
; CHECK-NEXT: vldrht.s32 q3, [r1], #8
; CHECK-NEXT: vmul.i32 q2, q3, q2
; CHECK-NEXT: vqshrnb.s32 q2, q2, #15
; CHECK-NEXT: vmovlb.s16 q2, q2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.32 q2, [r2], #8
; CHECK-NEXT: le lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
  %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
  %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
  %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
  %1 = bitcast i16* %next.gep to <4 x i16>*
  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> undef)
  %2 = sext <4 x i16> %wide.masked.load to <4 x i32>
  %3 = bitcast i16* %next.gep18 to <4 x i16>*
  %wide.masked.load22 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %3, i32 2, <4 x i1> %0, <4 x i16> undef)
  %4 = sext <4 x i16> %wide.masked.load22 to <4 x i32>
  %5 = mul nsw <4 x i32> %4, %2
  %6 = ashr <4 x i32> %5, <i32 15, i32 15, i32 15, i32 15>
  %7 = icmp sgt <4 x i32> %6, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %8 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %9 = icmp slt <4 x i32> %8, <i32 32767, i32 32767, i32 32767, i32 32767>
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %11 = trunc <4 x i32> %10 to <4 x i16>
  %12 = bitcast i16* %next.gep19 to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %11, <4 x i16>* %12, i32 2, <4 x i1> %0)
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, %n.vec
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_8t_q15:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r7, lr}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: .pad #16
|
|
|
|
; CHECK-NEXT: sub sp, #16
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq.w .LBB9_3
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %vector.ph
|
|
|
|
; CHECK-NEXT: add.w r12, r3, #7
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI9_0
|
|
|
|
; CHECK-NEXT: bic r12, r12, #7
|
|
|
|
; CHECK-NEXT: mov.w lr, #1
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w r12, r12, #8
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: adr r4, .LCPI9_1
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: movs r5, #0
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: add.w lr, lr, r12, lsr #3
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
%start = llvm.start.loop.iterations(%N)
loop:
%p = phi [%start], [%dec]
%dec = llvm.loop.decrement.reg(%p, 1)
%c = icmp ne %dec, 0
br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new instrinsic conceptually produces the same output as input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on it's own might cause more trouble that it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 23:57:58 +08:00
|
|
|
; CHECK-NEXT: sub.w r12, r3, #1
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: vldrw.u32 q4, [r4]
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vdup.32 q1, r12
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov.i8 q2, #0x0
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: vmov.i8 q3, #0xff
|
2020-10-20 15:55:21 +08:00
|
|
|
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB9_2: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vdup.32 q6, r5
|
|
|
|
; CHECK-NEXT: adds r5, #8
|
|
|
|
; CHECK-NEXT: vorr q5, q6, q0
|
|
|
|
; CHECK-NEXT: vorr q6, q6, q4
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q5
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vpsel q7, q3, q2
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q6
|
|
|
|
; CHECK-NEXT: vmov r4, s28
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vpsel q6, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.16 q5[0], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s29
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[1], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s30
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[2], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s31
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[3], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s24
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[4], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s25
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[5], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s26
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[6], r4
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r4, s27
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmov.16 q5[7], r4
|
|
|
|
; CHECK-NEXT: vpt.i16 ne, q5, zr
|
|
|
|
; CHECK-NEXT: vldrht.u16 q6, [r0], #16
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q6[2]
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q6[0]
|
|
|
|
; CHECK-NEXT: vmov q5[2], q5[0], r3, r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q6[3]
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q6[1]
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: vpst
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrht.u16 q7, [r1], #16
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q5[3], q5[1], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q7[2]
|
2020-12-18 21:33:40 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q7[0]
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q7[3]
|
2020-12-18 21:33:40 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q7[1]
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q6[4]
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmullb.s16 q0, q0, q5
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmovlb.s16 q0, q0
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov r3, s0
|
|
|
|
; CHECK-NEXT: vmov.16 q5[0], r3
|
|
|
|
; CHECK-NEXT: vmov r3, s1
|
|
|
|
; CHECK-NEXT: vmov.16 q5[1], r3
|
|
|
|
; CHECK-NEXT: vmov r3, s2
|
|
|
|
; CHECK-NEXT: vmov.16 q5[2], r3
|
|
|
|
; CHECK-NEXT: vmov r3, s3
|
|
|
|
; CHECK-NEXT: vmov.16 q5[3], r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q6[6]
|
|
|
|
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q6[7]
|
2020-12-18 21:33:40 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q6[5]
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q7[6]
|
2020-12-18 21:33:40 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q7[4]
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q6[2], q6[0], r4, r3
|
|
|
|
; CHECK-NEXT: vmov.u16 r3, q7[7]
|
2020-12-18 21:33:40 +08:00
|
|
|
; CHECK-NEXT: vmov.u16 r4, q7[5]
|
[ARM] Match dual lane vmovs from insert_vector_elt
MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. They look
like one of:
vmov q0[2], q0[0], r2, r0
vmov q0[3], q0[1], r3, r1
They only accept these lane indices though (and only insert into an
i32), either moving lanes 1 and 3, or 0 and 2.
This patch adds some tablegen patterns for them, selecting from vector
inserts elements. Because the insert_elements are know to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:
3 2 1 0 -> vmovqrr 31; vmovqrr 20
3 2 1 -> vmovqrr 31; vmov 2
3 1 -> vmovqrr 31
2 1 0 -> vmovqrr 20; vmov 1
2 0 -> vmovqrr 20
With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.
This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.
2020-12-19 00:13:08 +08:00
|
|
|
; CHECK-NEXT: vmov q6[3], q6[1], r4, r3
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmullb.s16 q0, q6, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vmovlb.s16 q0, q0
|
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov.16 q5[4], r3
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vmov.16 q5[5], r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.16 q5[6], r3
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: vmov.16 q5[7], r3
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q5, [r2], #16
; CHECK-NEXT: le lr, .LBB9_2
; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
%next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
%next.gep19 = getelementptr i16, i16* %pDst, i32 %index
%0 = icmp ule <8 x i32> %induction, %broadcast.splat21
%1 = bitcast i16* %next.gep to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %0, <8 x i16> undef)
%2 = sext <8 x i16> %wide.masked.load to <8 x i32>
%3 = bitcast i16* %next.gep18 to <8 x i16>*
%wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %0, <8 x i16> undef)
%4 = sext <8 x i16> %wide.masked.load22 to <8 x i32>
%5 = mul nsw <8 x i32> %4, %2
%6 = ashr <8 x i32> %5, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%7 = icmp sgt <8 x i32> %6, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%8 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%9 = icmp slt <8 x i32> %8, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
%10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
%11 = trunc <8 x i32> %10 to <8 x i16>
%12 = bitcast i16* %next.gep19 to <8 x i16>*
call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %11, <8 x i16>* %12, i32 2, <8 x i1> %0)
%index.next = add i32 %index, 8
%13 = icmp eq i32 %index.next, %n.vec
br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

define arm_aapcs_vfpcc void @ssatmul_8ti_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8ti_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w r12, r3, #7
; CHECK-NEXT: adr r4, .LCPI10_0
; CHECK-NEXT: bic r12, r12, #7
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: sub.w r12, r12, #8
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI10_1
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: add.w lr, lr, r12, lsr #3
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vldrw.u32 q4, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vdup.32 q6, r3
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vorr q5, q6, q0
; CHECK-NEXT: vorr q6, q6, q4
; CHECK-NEXT: vcmp.u32 cs, q1, q5
; CHECK-NEXT: vpsel q7, q3, q2
; CHECK-NEXT: vcmp.u32 cs, q1, q6
; CHECK-NEXT: vmov r4, s28
; CHECK-NEXT: vpsel q6, q3, q2
; CHECK-NEXT: vmov.16 q5[0], r4
; CHECK-NEXT: vmov r4, s29
; CHECK-NEXT: vmov.16 q5[1], r4
; CHECK-NEXT: vmov r4, s30
; CHECK-NEXT: vmov.16 q5[2], r4
; CHECK-NEXT: vmov r4, s31
; CHECK-NEXT: vmov.16 q5[3], r4
; CHECK-NEXT: vmov r4, s24
; CHECK-NEXT: vmov.16 q5[4], r4
; CHECK-NEXT: vmov r4, s25
; CHECK-NEXT: vmov.16 q5[5], r4
; CHECK-NEXT: vmov r4, s26
; CHECK-NEXT: vmov.16 q5[6], r4
; CHECK-NEXT: vmov r4, s27
; CHECK-NEXT: vmov.16 q5[7], r4
; CHECK-NEXT: vptt.i16 ne, q5, zr
; CHECK-NEXT: vldrht.u16 q5, [r0], #16
; CHECK-NEXT: vldrht.u16 q6, [r1], #16
; CHECK-NEXT: vmullt.s16 q7, q6, q5
; CHECK-NEXT: vmullb.s16 q5, q6, q5
; CHECK-NEXT: vqshrnb.s32 q5, q5, #15
; CHECK-NEXT: vqshrnt.s32 q5, q7, #15
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q5, [r2], #16
; CHECK-NEXT: le lr, .LBB10_2
; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI10_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
%next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
%next.gep19 = getelementptr i16, i16* %pDst, i32 %index
%0 = icmp ule <8 x i32> %induction, %broadcast.splat21
%1 = bitcast i16* %next.gep to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %0, <8 x i16> undef)
%2 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%3 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%4 = sext <4 x i16> %2 to <4 x i32>
%5 = sext <4 x i16> %3 to <4 x i32>
%6 = bitcast i16* %next.gep18 to <8 x i16>*
%wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %6, i32 2, <8 x i1> %0, <8 x i16> undef)
%7 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%8 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%9 = sext <4 x i16> %7 to <4 x i32>
%10 = sext <4 x i16> %8 to <4 x i32>
%11 = mul <4 x i32> %9, %4
%12 = mul <4 x i32> %10, %5
%13 = ashr <4 x i32> %11, <i32 15, i32 15, i32 15, i32 15>
%14 = ashr <4 x i32> %12, <i32 15, i32 15, i32 15, i32 15>
%15 = icmp sgt <4 x i32> %13, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%16 = icmp sgt <4 x i32> %14, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%17 = select <4 x i1> %15, <4 x i32> %13, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%18 = select <4 x i1> %16, <4 x i32> %14, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%19 = icmp slt <4 x i32> %17, <i32 32767, i32 32767, i32 32767, i32 32767>
%20 = icmp slt <4 x i32> %18, <i32 32767, i32 32767, i32 32767, i32 32767>
%21 = select <4 x i1> %19, <4 x i32> %17, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
%22 = select <4 x i1> %20, <4 x i32> %18, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
%23 = shufflevector <4 x i32> %21, <4 x i32> %22, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
%24 = trunc <8 x i32> %23 to <8 x i16>
%25 = bitcast i16* %next.gep19 to <8 x i16>*
call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %24, <8 x i16>* %25, i32 2, <8 x i1> %0)
%index.next = add i32 %index, 8
%26 = icmp eq i32 %index.next, %n.vec
br i1 %26, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: usatmul_4_q15:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB11_8
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #3
|
|
|
|
; CHECK-NEXT: bhi .LBB11_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB11_6
|
|
|
|
; CHECK-NEXT: .LBB11_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #3
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: subs r6, r5, #4
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5, lsl #1
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #2
|
|
|
|
; CHECK-NEXT: add.w r4, r2, r5, lsl #1
|
|
|
|
; CHECK-NEXT: add.w r6, r1, r5, lsl #1
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB11_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrh.u32 q0, [r0], #8
|
|
|
|
; CHECK-NEXT: vldrh.u32 q1, [r1], #8
|
|
|
|
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrh.32 q0, [r2], #8
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB11_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: movw r0, #65535
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB11_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: ldrh r1, [r12], #2
|
|
|
|
; CHECK-NEXT: ldrh r2, [r6], #2
|
|
|
|
; CHECK-NEXT: muls r1, r2, r1
|
|
|
|
; CHECK-NEXT: lsrs r2, r1, #15
|
|
|
|
; CHECK-NEXT: cmp r2, r0
|
|
|
|
; CHECK-NEXT: movw r2, #65535
|
|
|
|
; CHECK-NEXT: it lo
|
|
|
|
; CHECK-NEXT: lsrlo r2, r1, #15
|
|
|
|
; CHECK-NEXT: strh r2, [r4], #2
|
|
|
|
; CHECK-NEXT: le lr, .LBB11_7
|
|
|
|
; CHECK-NEXT: .LBB11_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader21: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
|
|
|
|
%pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -4
|
|
|
|
%ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
|
|
|
|
%next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
|
|
|
|
%next.gep19 = getelementptr i16, i16* %pDst, i32 %index
|
|
|
|
%0 = bitcast i16* %next.gep to <4 x i16>*
|
|
|
|
%wide.load = load <4 x i16>, <4 x i16>* %0, align 2
|
|
|
|
%1 = zext <4 x i16> %wide.load to <4 x i32>
|
|
|
|
%2 = bitcast i16* %next.gep18 to <4 x i16>*
|
|
|
|
%wide.load20 = load <4 x i16>, <4 x i16>* %2, align 2
|
|
|
|
%3 = zext <4 x i16> %wide.load20 to <4 x i32>
|
|
|
|
%4 = mul nuw <4 x i32> %3, %1
|
|
|
|
%5 = lshr <4 x i32> %4, <i32 15, i32 15, i32 15, i32 15>
|
|
|
|
%6 = icmp ult <4 x i32> %5, <i32 65535, i32 65535, i32 65535, i32 65535>
|
|
|
|
%7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i16>
|
|
|
|
%9 = bitcast i16* %next.gep19 to <4 x i16>*
|
|
|
|
store <4 x i16> %8, <4 x i16>* %9, align 2
|
|
|
|
%index.next = add i32 %index, 4
|
|
|
|
%10 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %10, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader21, %for.body
|
|
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
|
|
|
|
%pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
|
|
|
|
%11 = load i16, i16* %pSrcA.addr.011, align 2
|
|
|
|
%conv = zext i16 %11 to i32
|
|
|
|
%incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
|
|
|
|
%12 = load i16, i16* %pSrcB.addr.010, align 2
|
|
|
|
%conv2 = zext i16 %12 to i32
|
|
|
|
%mul = mul nuw i32 %conv2, %conv
|
|
|
|
%shr = lshr i32 %mul, 15
|
|
|
|
%13 = icmp ult i32 %shr, 65535
|
|
|
|
%retval.0.i = select i1 %13, i32 %shr, i32 65535
|
|
|
|
%conv3 = trunc i32 %retval.0.i to i16
|
|
|
|
%incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
|
|
|
|
store i16 %conv3, i16* %pDst.addr.09, align 2
|
|
|
|
%inc = add nuw i32 %i.012, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: usatmul_8_q15:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB12_8
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #7
|
|
|
|
; CHECK-NEXT: bhi .LBB12_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB12_6
|
|
|
|
; CHECK-NEXT: .LBB12_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #7
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #8
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5, lsl #1
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #3
|
|
|
|
; CHECK-NEXT: add.w r4, r2, r5, lsl #1
|
|
|
|
; CHECK-NEXT: add.w r6, r1, r5, lsl #1
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB12_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
|
|
|
|
; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
|
|
|
|
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
|
|
|
; CHECK-NEXT: vldrh.u32 q1, [r1], #16
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrh.32 q0, [r2, #8]
|
|
|
|
; CHECK-NEXT: vldrh.u32 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrh.32 q0, [r2], #16
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB12_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: movw r0, #65535
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB12_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: ldrh r1, [r12], #2
|
|
|
|
; CHECK-NEXT: ldrh r2, [r6], #2
|
|
|
|
; CHECK-NEXT: muls r1, r2, r1
|
|
|
|
; CHECK-NEXT: lsrs r2, r1, #15
|
|
|
|
; CHECK-NEXT: cmp r2, r0
|
|
|
|
; CHECK-NEXT: movw r2, #65535
|
|
|
|
; CHECK-NEXT: it lo
|
|
|
|
; CHECK-NEXT: lsrlo r2, r1, #15
|
|
|
|
; CHECK-NEXT: strh r2, [r4], #2
|
|
|
|
; CHECK-NEXT: le lr, .LBB12_7
|
|
|
|
; CHECK-NEXT: .LBB12_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 8
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader21: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
|
|
|
|
%pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -8
|
|
|
|
%ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
|
|
|
|
%next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
|
|
|
|
%next.gep19 = getelementptr i16, i16* %pDst, i32 %index
|
|
|
|
%0 = bitcast i16* %next.gep to <8 x i16>*
|
|
|
|
%wide.load = load <8 x i16>, <8 x i16>* %0, align 2
|
|
|
|
%1 = zext <8 x i16> %wide.load to <8 x i32>
|
|
|
|
%2 = bitcast i16* %next.gep18 to <8 x i16>*
|
|
|
|
%wide.load20 = load <8 x i16>, <8 x i16>* %2, align 2
|
|
|
|
%3 = zext <8 x i16> %wide.load20 to <8 x i32>
|
|
|
|
%4 = mul nuw <8 x i32> %3, %1
|
|
|
|
%5 = lshr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
|
|
|
|
%6 = icmp ult <8 x i32> %5, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
|
|
|
|
%7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i16>
|
|
|
|
%9 = bitcast i16* %next.gep19 to <8 x i16>*
|
|
|
|
store <8 x i16> %8, <8 x i16>* %9, align 2
|
|
|
|
%index.next = add i32 %index, 8
|
|
|
|
%10 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %10, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader21, %for.body
|
|
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
|
|
|
|
%pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
|
|
|
|
%11 = load i16, i16* %pSrcA.addr.011, align 2
|
|
|
|
%conv = zext i16 %11 to i32
|
|
|
|
%incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
|
|
|
|
%12 = load i16, i16* %pSrcB.addr.010, align 2
|
|
|
|
%conv2 = zext i16 %12 to i32
|
|
|
|
%mul = mul nuw i32 %conv2, %conv
|
|
|
|
%shr = lshr i32 %mul, 15
|
|
|
|
%13 = icmp ult i32 %shr, 65535
|
|
|
|
%retval.0.i = select i1 %13, i32 %shr, i32 65535
|
|
|
|
%conv3 = trunc i32 %retval.0.i to i16
|
|
|
|
%incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
|
|
|
|
store i16 %conv3, i16* %pDst.addr.09, align 2
|
|
|
|
%inc = add nuw i32 %i.012, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
; i8
|
|
|
|
|
|
|
|
define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_4_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB13_8
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #3
|
|
|
|
; CHECK-NEXT: bhi .LBB13_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB13_6
|
|
|
|
; CHECK-NEXT: .LBB13_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #3
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: subs r6, r5, #4
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: vmvn.i32 q0, #0x7f
|
|
|
|
; CHECK-NEXT: vmov.i32 q1, #0x7f
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #2
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB13_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: vldrb.s32 q2, [r0], #4
|
|
|
|
; CHECK-NEXT: vldrb.s32 q3, [r1], #4
|
|
|
|
; CHECK-NEXT: vmul.i32 q2, q3, q2
|
|
|
|
; CHECK-NEXT: vshr.s32 q2, q2, #7
|
|
|
|
; CHECK-NEXT: vmax.s32 q2, q2, q0
|
|
|
|
; CHECK-NEXT: vmin.s32 q2, q2, q1
|
|
|
|
; CHECK-NEXT: vstrb.32 q2, [r2], #4
|
|
|
|
; CHECK-NEXT: le lr, .LBB13_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB13_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: ldrsb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrsb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
2020-09-28 22:50:19 +08:00
|
|
|
; CHECK-NEXT: ssat r0, #8, r0, asr #7
|
2020-09-14 18:57:41 +08:00
|
|
|
; CHECK-NEXT: strb r0, [r4], #1
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB13_7
|
|
|
|
; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp8 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 4
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader21: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.011.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.010.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
|
|
|
|
%pDst.addr.09.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -4
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end15 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep18 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep19 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <4 x i8>*
|
|
|
|
%wide.load = load <4 x i8>, <4 x i8>* %0, align 1
|
|
|
|
%1 = sext <4 x i8> %wide.load to <4 x i32>
|
|
|
|
%2 = bitcast i8* %next.gep18 to <4 x i8>*
|
|
|
|
%wide.load20 = load <4 x i8>, <4 x i8>* %2, align 1
|
|
|
|
%3 = sext <4 x i8> %wide.load20 to <4 x i32>
|
|
|
|
%4 = mul nsw <4 x i32> %3, %1
|
|
|
|
%5 = ashr <4 x i32> %4, <i32 7, i32 7, i32 7, i32 7>
|
|
|
|
%6 = icmp sgt <4 x i32> %5, <i32 -128, i32 -128, i32 -128, i32 -128>
|
|
|
|
%7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>
|
|
|
|
%8 = icmp slt <4 x i32> %7, <i32 127, i32 127, i32 127, i32 127>
|
|
|
|
%9 = select <4 x i1> %8, <4 x i32> %7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
|
|
|
|
%10 = trunc <4 x i32> %9 to <4 x i8>
|
|
|
|
%11 = bitcast i8* %next.gep19 to <4 x i8>*
|
|
|
|
store <4 x i8> %10, <4 x i8>* %11, align 1
|
|
|
|
%index.next = add i32 %index, 4
|
|
|
|
%12 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %12, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader21, %for.body
|
|
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcA.addr.011 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
|
|
|
|
%pSrcB.addr.010 = phi i8* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
|
|
|
|
%pDst.addr.09 = phi i8* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
|
|
|
|
%13 = load i8, i8* %pSrcA.addr.011, align 1
|
|
|
|
%conv = sext i8 %13 to i32
|
|
|
|
%incdec.ptr1 = getelementptr inbounds i8, i8* %pSrcB.addr.010, i32 1
|
|
|
|
%14 = load i8, i8* %pSrcB.addr.010, align 1
|
|
|
|
%conv2 = sext i8 %14 to i32
|
|
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
|
|
%shr = ashr i32 %mul, 7
|
|
|
|
%15 = icmp sgt i32 %shr, -128
|
|
|
|
%.val.i = select i1 %15, i32 %shr, i32 -128
|
|
|
|
%16 = icmp slt i32 %.val.i, 127
|
|
|
|
%retval.0.i = select i1 %16, i32 %.val.i, i32 127
|
|
|
|
%conv3 = trunc i32 %retval.0.i to i8
|
|
|
|
%incdec.ptr4 = getelementptr inbounds i8, i8* %pDst.addr.09, i32 1
|
|
|
|
store i8 %conv3, i8* %pDst.addr.09, align 1
|
|
|
|
%inc = add nuw i32 %i.012, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
define arm_aapcs_vfpcc void @ssatmul_8_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_8_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: cbz r3, .LBB14_8
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #7
|
|
|
|
; CHECK-NEXT: bhi .LBB14_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB14_6
|
|
|
|
; CHECK-NEXT: .LBB14_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #7
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #8
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #3
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB14_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrb.s16 q0, [r0], #8
|
|
|
|
; CHECK-NEXT: vldrb.s16 q1, [r1], #8
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrb.16 q0, [r2], #8
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB14_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB14_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: ldrsb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrsb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
2020-09-28 22:50:19 +08:00
|
|
|
; CHECK-NEXT: ssat r0, #8, r0, asr #7
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: strb r0, [r4], #1
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB14_7
|
|
|
|
; CHECK-NEXT: .LBB14_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 8
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader23: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
%pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -8
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <8 x i8>*
|
|
|
|
%wide.load = load <8 x i8>, <8 x i8>* %0, align 1
|
|
|
|
%1 = sext <8 x i8> %wide.load to <8 x i16>
|
|
|
|
%2 = bitcast i8* %next.gep20 to <8 x i8>*
|
|
|
|
%wide.load22 = load <8 x i8>, <8 x i8>* %2, align 1
|
|
|
|
%3 = sext <8 x i8> %wide.load22 to <8 x i16>
|
|
|
|
%4 = mul nsw <8 x i16> %3, %1
|
|
|
|
%5 = ashr <8 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%8 = icmp slt <8 x i16> %7, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%9 = select <8 x i1> %8, <8 x i16> %7, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%10 = trunc <8 x i16> %9 to <8 x i8>
|
|
|
|
%11 = bitcast i8* %next.gep21 to <8 x i8>*
|
|
|
|
store <8 x i8> %10, <8 x i8>* %11, align 1
|
|
|
|
%index.next = add i32 %index, 8
|
|
|
|
%12 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %12, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader23, %for.body
|
|
|
|
%i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
|
|
|
|
%pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
|
|
|
|
%13 = load i8, i8* %pSrcA.addr.013, align 1
|
|
|
|
%conv1 = sext i8 %13 to i16
|
|
|
|
%incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
|
|
|
|
%14 = load i8, i8* %pSrcB.addr.012, align 1
|
|
|
|
%conv3 = sext i8 %14 to i16
|
|
|
|
%mul = mul nsw i16 %conv3, %conv1
|
|
|
|
%shr = ashr i16 %mul, 7
|
|
|
|
%15 = icmp sgt i16 %shr, -128
|
|
|
|
%.val.i = select i1 %15, i16 %shr, i16 -128
|
|
|
|
%16 = icmp slt i16 %.val.i, 127
|
|
|
|
%retval.0.i = select i1 %16, i16 %.val.i, i16 127
|
|
|
|
%conv5 = trunc i16 %retval.0.i to i8
|
|
|
|
%incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
|
|
|
|
store i8 %conv5, i8* %pDst.addr.011, align 1
|
|
|
|
%inc = add nuw i32 %i.014, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_16_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB15_8
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #15
|
|
|
|
; CHECK-NEXT: bhi .LBB15_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB15_6
|
|
|
|
; CHECK-NEXT: .LBB15_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #15
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #16
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #4
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB15_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrb.s16 q0, [r0, #8]
|
|
|
|
; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
|
|
|
|
; CHECK-NEXT: vldrb.s16 q1, [r1], #16
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrb.16 q0, [r2, #8]
|
|
|
|
; CHECK-NEXT: vldrb.s16 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrb.16 q0, [r2], #16
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB15_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB15_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: ldrsb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrsb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
2020-09-28 22:50:19 +08:00
|
|
|
; CHECK-NEXT: ssat r0, #8, r0, asr #7
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: strb r0, [r4], #1
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB15_7
|
|
|
|
; CHECK-NEXT: .LBB15_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 16
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader23: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
%pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -16
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <16 x i8>*
|
|
|
|
%wide.load = load <16 x i8>, <16 x i8>* %0, align 1
|
|
|
|
%1 = sext <16 x i8> %wide.load to <16 x i16>
|
|
|
|
%2 = bitcast i8* %next.gep20 to <16 x i8>*
|
|
|
|
%wide.load22 = load <16 x i8>, <16 x i8>* %2, align 1
|
|
|
|
%3 = sext <16 x i8> %wide.load22 to <16 x i16>
|
|
|
|
%4 = mul nsw <16 x i16> %3, %1
|
|
|
|
%5 = ashr <16 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%8 = icmp slt <16 x i16> %7, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%9 = select <16 x i1> %8, <16 x i16> %7, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%10 = trunc <16 x i16> %9 to <16 x i8>
|
|
|
|
%11 = bitcast i8* %next.gep21 to <16 x i8>*
|
|
|
|
store <16 x i8> %10, <16 x i8>* %11, align 1
|
|
|
|
%index.next = add i32 %index, 16
|
|
|
|
%12 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %12, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader23, %for.body
|
|
|
|
%i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
|
|
|
|
%pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
|
|
|
|
%13 = load i8, i8* %pSrcA.addr.013, align 1
|
|
|
|
%conv1 = sext i8 %13 to i16
|
|
|
|
%incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
|
|
|
|
%14 = load i8, i8* %pSrcB.addr.012, align 1
|
|
|
|
%conv3 = sext i8 %14 to i16
|
|
|
|
%mul = mul nsw i16 %conv3, %conv1
|
|
|
|
%shr = ashr i16 %mul, 7
|
|
|
|
%15 = icmp sgt i16 %shr, -128
|
|
|
|
%.val.i = select i1 %15, i16 %shr, i16 -128
|
|
|
|
%16 = icmp slt i16 %.val.i, 127
|
|
|
|
%retval.0.i = select i1 %16, i16 %.val.i, i16 127
|
|
|
|
%conv5 = trunc i16 %retval.0.i to i8
|
|
|
|
%incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
|
|
|
|
store i8 %conv5, i8* %pDst.addr.011, align 1
|
|
|
|
%inc = add nuw i32 %i.014, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
define arm_aapcs_vfpcc void @ssatmul_16i_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_16i_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
2020-12-10 20:14:23 +08:00
|
|
|
; CHECK-NEXT: cbz r3, .LBB16_8
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #15
|
|
|
|
; CHECK-NEXT: bhi .LBB16_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB16_6
|
|
|
|
; CHECK-NEXT: .LBB16_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #15
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #16
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #4
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB16_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
|
|
|
|
; CHECK-NEXT: vmullt.s8 q2, q1, q0
|
|
|
|
; CHECK-NEXT: vmullb.s8 q0, q1, q0
|
2020-05-16 22:27:20 +08:00
|
|
|
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
|
|
|
|
; CHECK-NEXT: vqshrnt.s16 q0, q2, #7
|
2020-05-16 21:54:33 +08:00
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB16_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
|
2020-08-18 02:42:57 +08:00
|
|
|
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23
|
2020-11-11 00:28:57 +08:00
|
|
|
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: .LBB16_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: ldrsb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrsb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
2020-09-28 22:50:19 +08:00
|
|
|
; CHECK-NEXT: ssat r0, #8, r0, asr #7
|
2020-09-22 19:54:10 +08:00
|
|
|
; CHECK-NEXT: strb r0, [r4], #1
|
2020-04-06 17:26:40 +08:00
|
|
|
; CHECK-NEXT: le lr, .LBB16_7
|
|
|
|
; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 16
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader23: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
%pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -16
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <16 x i8>*
|
|
|
|
%wide.load = load <16 x i8>, <16 x i8>* %0, align 1
|
|
|
|
%1 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
|
|
%2 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
|
|
%3 = sext <8 x i8> %1 to <8 x i16>
|
|
|
|
%4 = sext <8 x i8> %2 to <8 x i16>
|
|
|
|
%5 = bitcast i8* %next.gep20 to <16 x i8>*
|
|
|
|
%wide.load22 = load <16 x i8>, <16 x i8>* %5, align 1
|
|
|
|
%6 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
|
|
%7 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
|
|
%8 = sext <8 x i8> %6 to <8 x i16>
|
|
|
|
%9 = sext <8 x i8> %7 to <8 x i16>
|
|
|
|
%10 = mul <8 x i16> %8, %3
|
|
|
|
%11 = mul <8 x i16> %9, %4
|
|
|
|
%12 = ashr <8 x i16> %10, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%13 = ashr <8 x i16> %11, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%14 = icmp sgt <8 x i16> %12, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%15 = icmp sgt <8 x i16> %13, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%16 = select <8 x i1> %14, <8 x i16> %12, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%17 = select <8 x i1> %15, <8 x i16> %13, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%18 = icmp slt <8 x i16> %16, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%19 = icmp slt <8 x i16> %17, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%20 = select <8 x i1> %18, <8 x i16> %16, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%21 = select <8 x i1> %19, <8 x i16> %17, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%22 = shufflevector <8 x i16> %20, <8 x i16> %21, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
|
|
|
|
%23 = trunc <16 x i16> %22 to <16 x i8>
|
|
|
|
%24 = bitcast i8* %next.gep21 to <16 x i8>*
|
|
|
|
store <16 x i8> %23, <16 x i8>* %24, align 1
|
|
|
|
%index.next = add i32 %index, 16
|
|
|
|
%25 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %25, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body, %for.body.preheader23
|
|
|
|
%i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
|
|
|
|
%pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
|
|
|
|
%26 = load i8, i8* %pSrcA.addr.013, align 1
|
|
|
|
%conv1 = sext i8 %26 to i16
|
|
|
|
%incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
|
|
|
|
%27 = load i8, i8* %pSrcB.addr.012, align 1
|
|
|
|
%conv3 = sext i8 %27 to i16
|
|
|
|
%mul = mul nsw i16 %conv3, %conv1
|
|
|
|
%shr = ashr i16 %mul, 7
|
|
|
|
%28 = icmp sgt i16 %shr, -128
|
|
|
|
%.val.i = select i1 %28, i16 %shr, i16 -128
|
|
|
|
%29 = icmp slt i16 %.val.i, 127
|
|
|
|
%retval.0.i = select i1 %29, i16 %.val.i, i16 127
|
|
|
|
%conv5 = trunc i16 %retval.0.i to i8
|
|
|
|
%incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
|
|
|
|
store i8 %conv5, i8* %pDst.addr.011, align 1
|
|
|
|
%inc = add nuw i32 %i.014, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
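
; Tail-folded 8-lane Q7 saturating multiply: the vector body masks off lanes past
; N, sign-extends the i8 inputs to i16, multiplies, arithmetic-shifts right by 7
; and clamps to [-128, 127] before the masked store.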
define arm_aapcs_vfpcc void @ssatmul_8t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_8t_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, lr}
|
|
|
|
; CHECK-NEXT: push {r4, lr}
|
|
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB17_3
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %vector.ph
|
|
|
|
; CHECK-NEXT: add.w r12, r3, #7
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI17_0
|
|
|
|
; CHECK-NEXT: bic r12, r12, #7
|
|
|
|
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: sub.w r12, r12, #8
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI17_1
|
|
|
|
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: add.w lr, lr, r12, lsr #3
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: vldrw.u32 q4, [r4]
; CHECK-NEXT: movs r3, #0
|
|
|
|
; CHECK-NEXT: vdup.32 q1, r12
|
|
|
|
; CHECK-NEXT: vmov.i8 q3, #0xff
|
|
|
|
; CHECK-NEXT: .LBB17_2: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vdup.32 q6, r3
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vorr q5, q6, q0
|
|
|
|
; CHECK-NEXT: vorr q6, q6, q4
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q5
|
|
|
|
; CHECK-NEXT: vpsel q7, q3, q2
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q6
|
|
|
|
; CHECK-NEXT: vmov r4, s28
|
|
|
|
; CHECK-NEXT: vpsel q6, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.16 q5[0], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s29
|
|
|
|
; CHECK-NEXT: vmov.16 q5[1], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s30
|
|
|
|
; CHECK-NEXT: vmov.16 q5[2], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s31
|
|
|
|
; CHECK-NEXT: vmov.16 q5[3], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s24
|
|
|
|
; CHECK-NEXT: vmov.16 q5[4], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s25
|
|
|
|
; CHECK-NEXT: vmov.16 q5[5], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s26
|
|
|
|
; CHECK-NEXT: vmov.16 q5[6], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s27
|
|
|
|
; CHECK-NEXT: vmov.16 q5[7], r4
|
|
|
|
; CHECK-NEXT: vptt.i16 ne, q5, zr
|
|
|
|
; CHECK-NEXT: vldrbt.s16 q5, [r0], #8
|
|
|
|
; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
|
|
|
|
; CHECK-NEXT: vmul.i16 q5, q6, q5
; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
; CHECK-NEXT: vmovlb.s8 q5, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.16 q5, [r2], #8
; CHECK-NEXT: le lr, .LBB17_2
|
|
|
|
; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: pop {r4, pc}
|
|
|
|
; CHECK-NEXT: .p2align 4
|
|
|
|
; CHECK-NEXT: @ %bb.4:
|
|
|
|
; CHECK-NEXT: .LCPI17_0:
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
; CHECK-NEXT: .long 1 @ 0x1
|
|
|
|
; CHECK-NEXT: .long 2 @ 0x2
|
|
|
|
; CHECK-NEXT: .long 3 @ 0x3
|
|
|
|
; CHECK-NEXT: .LCPI17_1:
|
|
|
|
; CHECK-NEXT: .long 4 @ 0x4
|
|
|
|
; CHECK-NEXT: .long 5 @ 0x5
|
|
|
|
; CHECK-NEXT: .long 6 @ 0x6
|
|
|
|
; CHECK-NEXT: .long 7 @ 0x7
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
|
|
|
|
|
|
|
|
vector.ph: ; preds = %entry
|
|
|
|
%n.rnd.up = add i32 %N, 7
|
|
|
|
%n.vec = and i32 %n.rnd.up, -8
|
|
|
|
%trip.count.minus.1 = add i32 %N, -1
|
|
|
|
%broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
|
|
|
|
%broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
|
|
|
|
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
|
|
|
|
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = icmp ule <8 x i32> %induction, %broadcast.splat23
|
|
|
|
%1 = bitcast i8* %next.gep to <8 x i8>*
|
|
|
|
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %1, i32 1, <8 x i1> %0, <8 x i8> undef)
|
|
|
|
%2 = sext <8 x i8> %wide.masked.load to <8 x i16>
|
|
|
|
%3 = bitcast i8* %next.gep20 to <8 x i8>*
|
|
|
|
%wide.masked.load24 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %3, i32 1, <8 x i1> %0, <8 x i8> undef)
|
|
|
|
%4 = sext <8 x i8> %wide.masked.load24 to <8 x i16>
|
|
|
|
%5 = mul nsw <8 x i16> %4, %2
|
|
|
|
%6 = ashr <8 x i16> %5, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%7 = icmp sgt <8 x i16> %6, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%9 = icmp slt <8 x i16> %8, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%11 = trunc <8 x i16> %10 to <8 x i8>
|
|
|
|
%12 = bitcast i8* %next.gep21 to <8 x i8>*
|
|
|
|
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %11, <8 x i8>* %12, i32 1, <8 x i1> %0)
|
|
|
|
%index.next = add i32 %index, 8
|
|
|
|
%13 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %13, label %for.cond.cleanup, label %vector.body
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
|
|
ret void
|
|
|
|
}
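
; Same saturating Q7 multiply, tail-folded at 16 lanes per iteration; the
; <16 x i1> mask is assembled from the four lane-index constant pools
; (.LCPI18_0 through .LCPI18_3) in the CHECK lines below.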
define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_16t_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, lr}
|
|
|
|
; CHECK-NEXT: push {r4, lr}
|
|
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #56
|
|
|
|
; CHECK-NEXT: sub sp, #56
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq.w .LBB18_3
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %vector.ph
|
|
|
|
; CHECK-NEXT: add.w r12, r3, #15
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI18_0
|
|
|
|
; CHECK-NEXT: bic r12, r12, #15
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: sub.w r12, r12, #16
|
|
|
|
; CHECK-NEXT: mov.w lr, #1
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI18_1
|
|
|
|
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: add.w lr, lr, r12, lsr #4
|
|
|
|
; CHECK-NEXT: sub.w r12, r3, #1
|
|
|
|
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI18_2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI18_3
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vldrw.u32 q6, [r4]
|
|
|
|
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB18_2: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vdup.32 q4, r3
; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: vorr q0, q4, q0
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q0
; CHECK-NEXT: vpsel q5, q3, q2
|
|
|
|
; CHECK-NEXT: vmov r4, s20
; CHECK-NEXT: vmov.16 q0[0], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s21
|
|
|
|
; CHECK-NEXT: vmov.16 q0[1], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s22
|
|
|
|
; CHECK-NEXT: vmov.16 q0[2], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s23
|
|
|
|
; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vmov.16 q0[3], r4
|
|
|
|
; CHECK-NEXT: vorr q5, q4, q5
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q5
|
|
|
|
; CHECK-NEXT: vpsel q5, q3, q2
|
|
|
|
; CHECK-NEXT: vmov r4, s20
|
|
|
|
; CHECK-NEXT: vmov.16 q0[4], r4
; CHECK-NEXT: vmov r4, s21
; CHECK-NEXT: vmov.16 q0[5], r4
; CHECK-NEXT: vmov r4, s22
; CHECK-NEXT: vmov.16 q0[6], r4
; CHECK-NEXT: vmov r4, s23
; CHECK-NEXT: vmov.16 q0[7], r4
; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vpsel q5, q3, q2
; CHECK-NEXT: vmov.u16 r4, q5[0]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov.u16 r4, q5[1]
; CHECK-NEXT: vmov.8 q0[1], r4
; CHECK-NEXT: vmov.u16 r4, q5[2]
; CHECK-NEXT: vmov.8 q0[2], r4
; CHECK-NEXT: vmov.u16 r4, q5[3]
; CHECK-NEXT: vmov.8 q0[3], r4
; CHECK-NEXT: vmov.u16 r4, q5[4]
; CHECK-NEXT: vmov.8 q0[4], r4
; CHECK-NEXT: vmov.u16 r4, q5[5]
; CHECK-NEXT: vmov.8 q0[5], r4
; CHECK-NEXT: vmov.u16 r4, q5[6]
; CHECK-NEXT: vmov.8 q0[6], r4
; CHECK-NEXT: vmov.u16 r4, q5[7]
; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vmov.8 q0[7], r4
|
|
|
|
; CHECK-NEXT: vorr q5, q4, q5
|
|
|
|
; CHECK-NEXT: vorr q4, q4, q6
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q5
|
|
|
|
; CHECK-NEXT: vpsel q7, q3, q2
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q4
|
|
|
|
; CHECK-NEXT: vmov r4, s28
|
|
|
|
; CHECK-NEXT: vpsel q4, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.16 q5[0], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s29
|
|
|
|
; CHECK-NEXT: vmov.16 q5[1], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s30
|
|
|
|
; CHECK-NEXT: vmov.16 q5[2], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s31
|
|
|
|
; CHECK-NEXT: vmov.16 q5[3], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s16
|
|
|
|
; CHECK-NEXT: vmov.16 q5[4], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s17
|
|
|
|
; CHECK-NEXT: vmov.16 q5[5], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s18
|
|
|
|
; CHECK-NEXT: vmov.16 q5[6], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s19
|
|
|
|
; CHECK-NEXT: vmov.16 q5[7], r4
|
|
|
|
; CHECK-NEXT: vcmp.i16 ne, q5, zr
|
|
|
|
; CHECK-NEXT: vpsel q4, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[0]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[8], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[1]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[9], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[2]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[10], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[3]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[11], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[4]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[12], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[5]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[13], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[6]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[14], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[7]
|
|
|
|
; CHECK-NEXT: vmov.8 q0[15], r4
|
|
|
|
; CHECK-NEXT: vpt.i8 ne, q0, zr
|
|
|
|
; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[0]
; CHECK-NEXT: vmov.16 q7[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[1]
; CHECK-NEXT: vmov.16 q7[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[2]
; CHECK-NEXT: vmov.16 q7[2], r4
; CHECK-NEXT: vmov.u8 r4, q0[3]
; CHECK-NEXT: vmov.16 q7[3], r4
; CHECK-NEXT: vmov.u8 r4, q0[4]
; CHECK-NEXT: vmov.16 q7[4], r4
; CHECK-NEXT: vmov.u8 r4, q0[5]
; CHECK-NEXT: vmov.16 q7[5], r4
; CHECK-NEXT: vmov.u8 r4, q0[6]
; CHECK-NEXT: vmov.16 q7[6], r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: vmov.16 q7[7], r4
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
; CHECK-NEXT: vmov.u8 r4, q4[0]
; CHECK-NEXT: vmov.16 q5[0], r4
; CHECK-NEXT: vmov.u8 r4, q4[1]
; CHECK-NEXT: vmov.16 q5[1], r4
; CHECK-NEXT: vmov.u8 r4, q4[2]
; CHECK-NEXT: vmov.16 q5[2], r4
; CHECK-NEXT: vmov.u8 r4, q4[3]
; CHECK-NEXT: vmov.16 q5[3], r4
; CHECK-NEXT: vmov.u8 r4, q4[4]
; CHECK-NEXT: vmov.16 q5[4], r4
; CHECK-NEXT: vmov.u8 r4, q4[5]
; CHECK-NEXT: vmov.16 q5[5], r4
; CHECK-NEXT: vmov.u8 r4, q4[6]
; CHECK-NEXT: vmov.16 q5[6], r4
; CHECK-NEXT: vmov.u8 r4, q4[7]
; CHECK-NEXT: vmov.16 q5[7], r4
; CHECK-NEXT: vmullb.s8 q5, q5, q7
; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
; CHECK-NEXT: vmovlb.s8 q5, q5
; CHECK-NEXT: vmov.u16 r4, q5[0]
; CHECK-NEXT: vmov.8 q7[0], r4
; CHECK-NEXT: vmov.u16 r4, q5[1]
; CHECK-NEXT: vmov.8 q7[1], r4
; CHECK-NEXT: vmov.u16 r4, q5[2]
; CHECK-NEXT: vmov.8 q7[2], r4
; CHECK-NEXT: vmov.u16 r4, q5[3]
; CHECK-NEXT: vmov.8 q7[3], r4
; CHECK-NEXT: vmov.u16 r4, q5[4]
; CHECK-NEXT: vmov.8 q7[4], r4
; CHECK-NEXT: vmov.u16 r4, q5[5]
; CHECK-NEXT: vmov.8 q7[5], r4
; CHECK-NEXT: vmov.u16 r4, q5[6]
; CHECK-NEXT: vmov.8 q7[6], r4
; CHECK-NEXT: vmov.u16 r4, q5[7]
; CHECK-NEXT: vmov.8 q7[7], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[8]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[0], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[9]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[1], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[10]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[2], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[11]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[3], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[12]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[4], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[13]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[5], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[14]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[6], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q0[15]
|
|
|
|
; CHECK-NEXT: vmov.16 q5[7], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[8]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[0], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[9]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[1], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[10]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[2], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[11]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[3], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[12]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[4], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[13]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[5], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[14]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[6], r4
|
|
|
|
; CHECK-NEXT: vmov.u8 r4, q4[15]
|
|
|
|
; CHECK-NEXT: vmov.16 q0[7], r4
|
|
|
|
; CHECK-NEXT: vmullb.s8 q0, q0, q5
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
; CHECK-NEXT: vmovlb.s8 q0, q0
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[0]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[8], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[1]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[9], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[2]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[10], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[3]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[11], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[4]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[12], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[5]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[13], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[6]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[14], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[7]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[15], r4
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q7, [r2], #16
; CHECK-NEXT: le lr, .LBB18_2
|
|
|
|
; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #56
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: pop {r4, pc}
|
|
|
|
; CHECK-NEXT: .p2align 4
|
|
|
|
; CHECK-NEXT: @ %bb.4:
|
|
|
|
; CHECK-NEXT: .LCPI18_0:
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
; CHECK-NEXT: .long 1 @ 0x1
|
|
|
|
; CHECK-NEXT: .long 2 @ 0x2
|
|
|
|
; CHECK-NEXT: .long 3 @ 0x3
|
|
|
|
; CHECK-NEXT: .LCPI18_1:
|
|
|
|
; CHECK-NEXT: .long 4 @ 0x4
|
|
|
|
; CHECK-NEXT: .long 5 @ 0x5
|
|
|
|
; CHECK-NEXT: .long 6 @ 0x6
|
|
|
|
; CHECK-NEXT: .long 7 @ 0x7
|
|
|
|
; CHECK-NEXT: .LCPI18_2:
|
|
|
|
; CHECK-NEXT: .long 8 @ 0x8
|
|
|
|
; CHECK-NEXT: .long 9 @ 0x9
|
|
|
|
; CHECK-NEXT: .long 10 @ 0xa
|
|
|
|
; CHECK-NEXT: .long 11 @ 0xb
|
|
|
|
; CHECK-NEXT: .LCPI18_3:
|
|
|
|
; CHECK-NEXT: .long 12 @ 0xc
|
|
|
|
; CHECK-NEXT: .long 13 @ 0xd
|
|
|
|
; CHECK-NEXT: .long 14 @ 0xe
|
|
|
|
; CHECK-NEXT: .long 15 @ 0xf
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
|
|
|
|
|
|
|
|
vector.ph: ; preds = %entry
|
|
|
|
%n.rnd.up = add i32 %N, 15
|
|
|
|
%n.vec = and i32 %n.rnd.up, -16
|
|
|
|
%trip.count.minus.1 = add i32 %N, -1
|
|
|
|
%broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
|
|
|
|
%broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
|
|
|
|
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
|
|
|
|
%induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = icmp ule <16 x i32> %induction, %broadcast.splat23
|
|
|
|
%1 = bitcast i8* %next.gep to <16 x i8>*
|
|
|
|
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %0, <16 x i8> undef)
|
|
|
|
%2 = sext <16 x i8> %wide.masked.load to <16 x i16>
|
|
|
|
%3 = bitcast i8* %next.gep20 to <16 x i8>*
|
|
|
|
%wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %0, <16 x i8> undef)
|
|
|
|
%4 = sext <16 x i8> %wide.masked.load24 to <16 x i16>
|
|
|
|
%5 = mul nsw <16 x i16> %4, %2
|
|
|
|
%6 = ashr <16 x i16> %5, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%7 = icmp sgt <16 x i16> %6, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%9 = icmp slt <16 x i16> %8, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%11 = trunc <16 x i16> %10 to <16 x i8>
|
|
|
|
%12 = bitcast i8* %next.gep21 to <16 x i8>*
|
|
|
|
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %11, <16 x i8>* %12, i32 1, <16 x i1> %0)
|
|
|
|
%index.next = add i32 %index, 16
|
|
|
|
%13 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %13, label %for.cond.cleanup, label %vector.body
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
|
|
ret void
|
|
|
|
}
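
; 16-lane tail-folded variant that de-interleaves even and odd bytes
; (shufflevector masks <0,2,...,14> and <1,3,...,15>) before multiplying,
; which lets the backend use vmullb.s8/vmullt.s8 with vqshrnb/vqshrnt.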
define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: ssatmul_16ti_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, lr}
|
|
|
|
; CHECK-NEXT: push {r4, lr}
|
|
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #56
|
|
|
|
; CHECK-NEXT: sub sp, #56
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq.w .LBB19_3
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %vector.ph
|
|
|
|
; CHECK-NEXT: add.w r12, r3, #15
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI19_0
|
|
|
|
; CHECK-NEXT: bic r12, r12, #15
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: sub.w r12, r12, #16
|
|
|
|
; CHECK-NEXT: mov.w lr, #1
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI19_1
|
|
|
|
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: add.w lr, lr, r12, lsr #4
|
|
|
|
; CHECK-NEXT: sub.w r12, r3, #1
|
|
|
|
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI19_2
|
|
|
|
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
|
|
|
; CHECK-NEXT: adr r4, .LCPI19_3
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vldrw.u32 q6, [r4]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
|
|
|
|
; CHECK-NEXT: .LBB19_2: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: vorr q4, q0, q4
; CHECK-NEXT: vcmp.u32 cs, q1, q4
; CHECK-NEXT: vpsel q4, q3, q2
|
|
|
|
; CHECK-NEXT: vmov r4, s16
|
|
|
|
; CHECK-NEXT: vmov.16 q7[0], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s17
|
|
|
|
; CHECK-NEXT: vmov.16 q7[1], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s18
|
|
|
|
; CHECK-NEXT: vmov.16 q7[2], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s19
|
|
|
|
; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vmov.16 q7[3], r4
|
|
|
|
; CHECK-NEXT: vorr q4, q0, q4
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q4
|
|
|
|
; CHECK-NEXT: vpsel q4, q3, q2
|
|
|
|
; CHECK-NEXT: vmov r4, s16
|
|
|
|
; CHECK-NEXT: vmov.16 q7[4], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s17
|
|
|
|
; CHECK-NEXT: vmov.16 q7[5], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s18
|
|
|
|
; CHECK-NEXT: vmov.16 q7[6], r4
|
|
|
|
; CHECK-NEXT: vmov r4, s19
|
|
|
|
; CHECK-NEXT: vmov.16 q7[7], r4
|
|
|
|
; CHECK-NEXT: vcmp.i16 ne, q7, zr
|
|
|
|
; CHECK-NEXT: vpsel q4, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[0]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[0], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[1]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[1], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[2]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[2], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[3]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[3], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[4]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[4], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[5]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[5], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[6]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[6], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q4[7]
|
|
|
|
; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
|
|
|
|
; CHECK-NEXT: vmov.8 q7[7], r4
|
|
|
|
; CHECK-NEXT: vorr q4, q0, q4
|
|
|
|
; CHECK-NEXT: vorr q0, q0, q6
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q4
|
|
|
|
; CHECK-NEXT: vpsel q5, q3, q2
|
|
|
|
; CHECK-NEXT: vcmp.u32 cs, q1, q0
|
|
|
|
; CHECK-NEXT: vmov r4, s20
|
|
|
|
; CHECK-NEXT: vpsel q0, q3, q2
; CHECK-NEXT: vmov.16 q4[0], r4
; CHECK-NEXT: vmov r4, s21
; CHECK-NEXT: vmov.16 q4[1], r4
; CHECK-NEXT: vmov r4, s22
; CHECK-NEXT: vmov.16 q4[2], r4
; CHECK-NEXT: vmov r4, s23
; CHECK-NEXT: vmov.16 q4[3], r4
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov.16 q4[4], r4
; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: vmov.16 q4[5], r4
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: vmov.16 q4[6], r4
; CHECK-NEXT: vmov r4, s3
; CHECK-NEXT: vmov.16 q4[7], r4
; CHECK-NEXT: vcmp.i16 ne, q4, zr
; CHECK-NEXT: vpsel q0, q3, q2
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[0]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[8], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[1]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[9], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[2]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[10], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[3]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[11], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[4]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[12], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[5]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[13], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[6]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[14], r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q0[7]
|
|
|
|
; CHECK-NEXT: vmov.8 q7[15], r4
|
|
|
|
; CHECK-NEXT: vptt.i8 ne, q7, zr
|
|
|
|
; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
|
|
|
|
; CHECK-NEXT: vmullt.s8 q5, q4, q0
|
|
|
|
; CHECK-NEXT: vmullb.s8 q0, q4, q0
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
|
|
|
|
; CHECK-NEXT: vqshrnt.s16 q0, q5, #7
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB19_2
|
|
|
|
; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #56
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
|
|
; CHECK-NEXT: pop {r4, pc}
|
|
|
|
; CHECK-NEXT: .p2align 4
|
|
|
|
; CHECK-NEXT: @ %bb.4:
|
|
|
|
; CHECK-NEXT: .LCPI19_0:
|
|
|
|
; CHECK-NEXT: .long 0 @ 0x0
|
|
|
|
; CHECK-NEXT: .long 1 @ 0x1
|
|
|
|
; CHECK-NEXT: .long 2 @ 0x2
|
|
|
|
; CHECK-NEXT: .long 3 @ 0x3
|
|
|
|
; CHECK-NEXT: .LCPI19_1:
|
|
|
|
; CHECK-NEXT: .long 4 @ 0x4
|
|
|
|
; CHECK-NEXT: .long 5 @ 0x5
|
|
|
|
; CHECK-NEXT: .long 6 @ 0x6
|
|
|
|
; CHECK-NEXT: .long 7 @ 0x7
|
|
|
|
; CHECK-NEXT: .LCPI19_2:
|
|
|
|
; CHECK-NEXT: .long 8 @ 0x8
|
|
|
|
; CHECK-NEXT: .long 9 @ 0x9
|
|
|
|
; CHECK-NEXT: .long 10 @ 0xa
|
|
|
|
; CHECK-NEXT: .long 11 @ 0xb
|
|
|
|
; CHECK-NEXT: .LCPI19_3:
|
|
|
|
; CHECK-NEXT: .long 12 @ 0xc
|
|
|
|
; CHECK-NEXT: .long 13 @ 0xd
|
|
|
|
; CHECK-NEXT: .long 14 @ 0xe
|
|
|
|
; CHECK-NEXT: .long 15 @ 0xf
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
|
|
|
|
|
|
|
|
vector.ph: ; preds = %entry
|
|
|
|
%n.rnd.up = add i32 %N, 15
|
|
|
|
%n.vec = and i32 %n.rnd.up, -16
|
|
|
|
%trip.count.minus.1 = add i32 %N, -1
|
|
|
|
%broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
|
|
|
|
%broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
|
|
|
|
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
|
|
|
|
%induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = icmp ule <16 x i32> %induction, %broadcast.splat23
|
|
|
|
%1 = bitcast i8* %next.gep to <16 x i8>*
|
|
|
|
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %0, <16 x i8> undef)
|
|
|
|
%2 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
|
|
%3 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
|
|
%4 = sext <8 x i8> %2 to <8 x i16>
|
|
|
|
%5 = sext <8 x i8> %3 to <8 x i16>
|
|
|
|
%6 = bitcast i8* %next.gep20 to <16 x i8>*
|
|
|
|
%wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %6, i32 1, <16 x i1> %0, <16 x i8> undef)
|
|
|
|
%7 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
|
|
%8 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
|
|
%9 = sext <8 x i8> %7 to <8 x i16>
|
|
|
|
%10 = sext <8 x i8> %8 to <8 x i16>
|
|
|
|
%11 = mul <8 x i16> %9, %4
|
|
|
|
%12 = mul <8 x i16> %10, %5
|
|
|
|
%13 = ashr <8 x i16> %11, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%14 = ashr <8 x i16> %12, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%15 = icmp sgt <8 x i16> %13, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%16 = icmp sgt <8 x i16> %14, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%17 = select <8 x i1> %15, <8 x i16> %13, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%18 = select <8 x i1> %16, <8 x i16> %14, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
|
|
|
|
%19 = icmp slt <8 x i16> %17, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%20 = icmp slt <8 x i16> %18, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%21 = select <8 x i1> %19, <8 x i16> %17, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%22 = select <8 x i1> %20, <8 x i16> %18, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
|
|
|
|
%23 = shufflevector <8 x i16> %21, <8 x i16> %22, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
|
|
|
|
%24 = trunc <16 x i16> %23 to <16 x i8>
|
|
|
|
%25 = bitcast i8* %next.gep21 to <16 x i8>*
|
|
|
|
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %24, <16 x i8>* %25, i32 1, <16 x i1> %0)
|
|
|
|
%index.next = add i32 %index, 16
|
|
|
|
%26 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %26, label %for.cond.cleanup, label %vector.body
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
|
|
ret void
|
|
|
|
}
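
; Unsigned 8-lane Q7 variant without tail folding: zero-extends to i16,
; multiplies, logical-shifts right by 7 and clamps to 255, with a scalar
; epilogue loop for the remaining elements.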
define arm_aapcs_vfpcc void @usatmul_8_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: usatmul_8_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r3, .LBB20_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #7
|
|
|
|
; CHECK-NEXT: bhi .LBB20_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB20_6
|
|
|
|
; CHECK-NEXT: .LBB20_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #7
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #8
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #3
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB20_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u16 q0, [r0], #8
|
|
|
|
; CHECK-NEXT: vldrb.u16 q1, [r1], #8
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
; CHECK-NEXT: vstrb.16 q0, [r2], #8
; CHECK-NEXT: le lr, .LBB20_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB20_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: ldrb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
|
|
|
; CHECK-NEXT: lsrs r1, r0, #7
|
|
|
|
; CHECK-NEXT: cmp r1, #255
|
|
|
|
; CHECK-NEXT: mov.w r1, #255
|
|
|
|
; CHECK-NEXT: it lo
|
|
|
|
; CHECK-NEXT: lsrlo r1, r0, #7
|
|
|
|
; CHECK-NEXT: strb r1, [r4], #1
|
|
|
|
; CHECK-NEXT: le lr, .LBB20_7
|
|
|
|
; CHECK-NEXT: .LBB20_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 8
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader23: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
%pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -8
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <8 x i8>*
|
|
|
|
%wide.load = load <8 x i8>, <8 x i8>* %0, align 1
|
|
|
|
%1 = zext <8 x i8> %wide.load to <8 x i16>
|
|
|
|
%2 = bitcast i8* %next.gep20 to <8 x i8>*
|
|
|
|
%wide.load22 = load <8 x i8>, <8 x i8>* %2, align 1
|
|
|
|
%3 = zext <8 x i8> %wide.load22 to <8 x i16>
|
|
|
|
%4 = mul nuw <8 x i16> %3, %1
|
|
|
|
%5 = lshr <8 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%6 = icmp ult <8 x i16> %5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
|
|
|
|
%7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
|
|
|
|
%8 = trunc <8 x i16> %7 to <8 x i8>
|
|
|
|
%9 = bitcast i8* %next.gep21 to <8 x i8>*
|
|
|
|
store <8 x i8> %8, <8 x i8>* %9, align 1
|
|
|
|
%index.next = add i32 %index, 8
|
|
|
|
%10 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %10, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader23, %for.body
|
|
|
|
%i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
|
|
|
|
%pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
|
|
|
|
%11 = load i8, i8* %pSrcA.addr.013, align 1
|
|
|
|
%conv1 = zext i8 %11 to i16
|
|
|
|
%incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
|
|
|
|
%12 = load i8, i8* %pSrcB.addr.012, align 1
|
|
|
|
%conv3 = zext i8 %12 to i16
|
|
|
|
%mul = mul nuw i16 %conv3, %conv1
|
|
|
|
%13 = lshr i16 %mul, 7
|
|
|
|
%14 = icmp ult i16 %13, 255
|
|
|
|
%retval.0.i = select i1 %14, i16 %13, i16 255
|
|
|
|
%conv5 = trunc i16 %retval.0.i to i8
|
|
|
|
%incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
|
|
|
|
store i8 %conv5, i8* %pDst.addr.011, align 1
|
|
|
|
%inc = add nuw i32 %i.014, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
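
; Unsigned 16-lane version of the same loop; the CHECK lines show the body
; split into two 8-element vldrb.u16/vstrb.16 halves.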
define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
|
|
|
|
; CHECK-LABEL: usatmul_16_q7:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
|
|
; CHECK-NEXT: cmp r3, #0
|
|
|
|
; CHECK-NEXT: beq .LBB21_8
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: cmp r3, #15
|
|
|
|
; CHECK-NEXT: bhi .LBB21_3
|
|
|
|
; CHECK-NEXT: @ %bb.2:
|
|
|
|
; CHECK-NEXT: movs r5, #0
|
|
|
|
; CHECK-NEXT: mov r12, r0
|
|
|
|
; CHECK-NEXT: mov r6, r1
|
|
|
|
; CHECK-NEXT: mov r4, r2
|
|
|
|
; CHECK-NEXT: b .LBB21_6
|
|
|
|
; CHECK-NEXT: .LBB21_3: @ %vector.ph
|
|
|
|
; CHECK-NEXT: bic r5, r3, #15
|
|
|
|
; CHECK-NEXT: movs r4, #1
|
|
|
|
; CHECK-NEXT: sub.w r6, r5, #16
|
|
|
|
; CHECK-NEXT: add.w r12, r0, r5
|
|
|
|
; CHECK-NEXT: add.w lr, r4, r6, lsr #4
|
|
|
|
; CHECK-NEXT: adds r4, r2, r5
|
|
|
|
; CHECK-NEXT: adds r6, r1, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
|
|
|
|
; CHECK-NEXT: .LBB21_4: @ %vector.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
|
|
|
|
; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
|
|
|
|
; CHECK-NEXT: vldrb.u16 q1, [r1], #16
; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
; CHECK-NEXT: vmovlb.u8 q0, q0
|
|
|
|
; CHECK-NEXT: vstrb.16 q0, [r2, #8]
|
|
|
|
; CHECK-NEXT: vldrb.u16 q0, [r0], #16
|
|
|
|
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
; CHECK-NEXT: vmovlb.u8 q0, q0
|
|
|
|
; CHECK-NEXT: vstrb.16 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB21_4
|
|
|
|
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
|
|
|
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: it eq
|
|
|
|
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23
; CHECK-NEXT: sub.w lr, r3, r5
|
|
|
|
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB21_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: ldrb r0, [r12], #1
|
|
|
|
; CHECK-NEXT: ldrb r1, [r6], #1
|
|
|
|
; CHECK-NEXT: muls r0, r1, r0
|
|
|
|
; CHECK-NEXT: lsrs r1, r0, #7
|
|
|
|
; CHECK-NEXT: cmp r1, #255
|
|
|
|
; CHECK-NEXT: mov.w r1, #255
|
|
|
|
; CHECK-NEXT: it lo
|
|
|
|
; CHECK-NEXT: lsrlo r1, r0, #7
|
|
|
|
; CHECK-NEXT: strb r1, [r4], #1
|
|
|
|
; CHECK-NEXT: le lr, .LBB21_7
|
|
|
|
; CHECK-NEXT: .LBB21_8: @ %for.cond.cleanup
|
|
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
|
|
entry:
|
|
|
|
%cmp10 = icmp eq i32 %N, 0
|
|
|
|
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
|
|
%min.iters.check = icmp ult i32 %N, 16
|
|
|
|
br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
|
|
|
|
|
|
|
|
for.body.preheader23: ; preds = %middle.block, %for.body.preheader
|
|
|
|
%i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
|
|
|
|
%pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
|
|
|
|
%pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
|
|
|
|
%pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
vector.ph: ; preds = %for.body.preheader
|
|
|
|
%n.vec = and i32 %N, -16
|
|
|
|
%ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
|
|
|
|
%ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
|
|
|
|
%ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
|
|
|
|
br label %vector.body
|
|
|
|
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
|
|
%next.gep = getelementptr i8, i8* %pSrcA, i32 %index
|
|
|
|
%next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
|
|
|
|
%next.gep21 = getelementptr i8, i8* %pDst, i32 %index
|
|
|
|
%0 = bitcast i8* %next.gep to <16 x i8>*
|
|
|
|
%wide.load = load <16 x i8>, <16 x i8>* %0, align 1
|
|
|
|
%1 = zext <16 x i8> %wide.load to <16 x i16>
|
|
|
|
%2 = bitcast i8* %next.gep20 to <16 x i8>*
|
|
|
|
%wide.load22 = load <16 x i8>, <16 x i8>* %2, align 1
|
|
|
|
%3 = zext <16 x i8> %wide.load22 to <16 x i16>
|
|
|
|
%4 = mul nuw <16 x i16> %3, %1
|
|
|
|
%5 = lshr <16 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
%6 = icmp ult <16 x i16> %5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
|
|
|
|
%7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
|
|
|
|
%8 = trunc <16 x i16> %7 to <16 x i8>
|
|
|
|
%9 = bitcast i8* %next.gep21 to <16 x i8>*
|
|
|
|
store <16 x i8> %8, <16 x i8>* %9, align 1
|
|
|
|
%index.next = add i32 %index, 16
|
|
|
|
%10 = icmp eq i32 %index.next, %n.vec
|
|
|
|
br i1 %10, label %middle.block, label %vector.body
|
|
|
|
|
|
|
|
middle.block: ; preds = %vector.body
|
|
|
|
%cmp.n = icmp eq i32 %n.vec, %N
|
|
|
|
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
|
|
|
|
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
|
|
|
|
ret void
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.preheader23, %for.body
|
|
|
|
%i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
|
|
|
|
%pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
|
|
|
|
%pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
|
|
|
|
%incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
|
|
|
|
%11 = load i8, i8* %pSrcA.addr.013, align 1
|
|
|
|
%conv1 = zext i8 %11 to i16
|
|
|
|
%incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
|
|
|
|
%12 = load i8, i8* %pSrcB.addr.012, align 1
|
|
|
|
%conv3 = zext i8 %12 to i16
|
|
|
|
%mul = mul nuw i16 %conv3, %conv1
|
|
|
|
%13 = lshr i16 %mul, 7
|
|
|
|
%14 = icmp ult i16 %13, 255
|
|
|
|
%retval.0.i = select i1 %14, i16 %13, i16 255
|
|
|
|
%conv5 = trunc i16 %retval.0.i to i8
|
|
|
|
%incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
|
|
|
|
store i8 %conv5, i8* %pDst.addr.011, align 1
|
|
|
|
%inc = add nuw i32 %i.014, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, %N
|
|
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
}
|
|
|
|
|
|
|
|
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
|
|
|
|
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
|
|
|
|
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
|
|
|
|
declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
|
|
|
|
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
|
|
|
|
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
|
|
|
|
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
|
|
|
|
declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
|
|
|
|
declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
|
|
|
|
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
|