; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
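
; Check that the NEON across-lanes add reductions (vaddv/vaddvq) lower to
; addv/addp/faddp operating directly on the vector registers, without
; unnecessary moves between the SIMD and general-purpose register files.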

define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_s8:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}
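
; The *_used_by_laneop tests feed the reduction result straight into a lane
; insert. The expected code keeps the value in the SIMD register file (the
; reduction followed by a lane mov) instead of moving it to a GPR and back.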

define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_s8_used_by_laneop:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_s16:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_s16_used_by_laneop:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_s32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_s32:
; addv with a 2 x i32 arrangement is not supported by the ISA, so this is a
; special case lowered to a pairwise add instead.
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_s32_used_by_laneop:
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

define i64 @test_vaddv_s64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_s64:
; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_s64_used_by_laneop:
; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_u8_used_by_laneop:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}
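
; The *_masked tests apply a mask that already covers every bit the narrow
; reduction result can set (addv.8b yields an 8-bit value, addv.4h a 16-bit
; value), so no 'and' is expected between the reduction and the return.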

define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8_masked:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = and i32 %vaddv.i, 511 ; 0x1ff
  ret i32 %0
}

define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_u16_used_by_laneop:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16_masked:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
  ret i32 %0
}

define i32 @test_vaddv_u32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_u32:
; addv with a 2 x i32 arrangement is not supported by the ISA, so this is a
; special case lowered to a pairwise add instead.
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_u32_used_by_laneop:
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}
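
; The floating-point reductions lower to faddp and leave the result directly
; in s0/d0, so no additional register moves are expected.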

define float @test_vaddv_f32(<2 x float> %a1) {
; CHECK-LABEL: test_vaddv_f32:
; CHECK: faddp.2s s0, v0
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
  ret float %vaddv.i
}

define float @test_vaddv_v4f32(<4 x float> %a1) {
; CHECK-LABEL: test_vaddv_v4f32:
; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
; CHECK: faddp.2s s0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
  ret float %vaddv.i
}

define double @test_vaddv_f64(<2 x double> %a1) {
; CHECK-LABEL: test_vaddv_f64:
; CHECK: faddp.2d d0, v0
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
  ret double %vaddv.i
}

define i64 @test_vaddv_u64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64:
; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_u64_used_by_laneop:
; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64_to_vec:
; CHECK: addp.2d d0, v0
; CHECK-NOT: fmov
; CHECK-NOT: ins
; CHECK: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
  ret <1 x i64> %vec
}

define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_s8:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_s8_used_by_laneop:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_s16:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_s16_used_by_laneop:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_s32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_s32:
; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
; CHECK-NEXT: fmov w0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_s32_used_by_laneop:
; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_u8:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_u8_used_by_laneop:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_u16:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_u16_used_by_laneop:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_u32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_u32:
; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_u32_used_by_laneop:
; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)
declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)
declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)
declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)
declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)
declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)
declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)
declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)