llvm-project/llvm/test/CodeGen/AArch64/arm64-uminv.ll

; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
; CHECK-LABEL: vmin_u8x8:
; CHECK: uminv.8b        b[[REG:[0-9]+]], v0
; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
; CHECK-NOT: and
; CHECK: cbz     [[REG2]],
entry:
  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
  %tmp = trunc i32 %vminv.i to i8
  %tobool = icmp eq i8 %tmp, 0
  br i1 %tobool, label %return, label %if.then

if.then:
  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
  br label %return

return:
  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
  ret i32 %retval.0
}

declare i32 @bar(...)

define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
; CHECK-LABEL: vmin_u4x16:
; CHECK: uminv.4h        h[[REG:[0-9]+]], v0
; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
; CHECK-NOT: and
; CHECK: cbz     [[REG2]],
entry:
  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
  %tmp = trunc i32 %vminv.i to i16
  %tobool = icmp eq i16 %tmp, 0
  br i1 %tobool, label %return, label %if.then

if.then:
  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
  br label %return

return:
  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
  ret i32 %retval.0
}

define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
; CHECK-LABEL: vmin_u8x16:
; CHECK: uminv.8h        h[[REG:[0-9]+]], v0
; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
; CHECK-NOT: and
; CHECK: cbz     [[REG2]],
entry:
  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
  %tmp = trunc i32 %vminv.i to i16
  %tobool = icmp eq i16 %tmp, 0
  br i1 %tobool, label %return, label %if.then

if.then:
  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
  br label %return

return:
  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
  ret i32 %retval.0
}

define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
; CHECK-LABEL: vmin_u16x8:
; CHECK: uminv.16b        b[[REG:[0-9]+]], v0
; CHECK: fmov     [[REG2:w[0-9]+]], s[[REG]]
; CHECK-NOT: and
; CHECK: cbz     [[REG2]],
entry:
  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
  %tmp = trunc i32 %vminv.i to i8
  %tobool = icmp eq i8 %tmp, 0
  br i1 %tobool, label %return, label %if.then

if.then:
  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
  br label %return

return:
  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
  ret i32 %retval.0
}

define <8 x i8> @test_vminv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vminv_u8_used_by_laneop:
; CHECK: uminv.8b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: ins.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define <4 x i16> @test_vminv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vminv_u16_used_by_laneop:
; CHECK: uminv.4h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: ins.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define <2 x i32> @test_vminv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vminv_u32_used_by_laneop:
; CHECK: uminp.2s v[[REGNUM:[0-9]+]], v1, v1
; CHECK-NEXT: ins.s v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

define <16 x i8> @test_vminvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vminvq_u8_used_by_laneop:
; CHECK: uminv.16b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: ins.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define <8 x i16> @test_vminvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vminvq_u16_used_by_laneop:
; CHECK: uminv.8h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: ins.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define <4 x i32> @test_vminvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vminvq_u32_used_by_laneop:
; CHECK: uminv.4s s[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: ins.s v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}
declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>) nounwind readnone
[AARCH64] Enable AARCH64 lit tests on windows dev machines As discussed on PR27654, this patch fixes the triples of a lot of aarch64 tests and enables lit tests on windows This will hopefully help stop cases where windows developers break the aarch64 target Differential Revision: https://reviews.llvm.org/D22191 llvm-svn: 275973 2016-07-19 21:35:11 +08:00			`; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false \| FileCheck %s`
ARM64: initial backend import This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit. llvm-svn: 205090 2014-03-29 18:18:08 +08:00
			`define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {`
			`; CHECK-LABEL: vmin_u8x8:`
			`; CHECK: uminv.8b b[[REG:[0-9]+]], v0`
			`; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]`
			`; CHECK-NOT: and`
			`; CHECK: cbz [[REG2]],`
			`entry:`
AArch64/ARM64: move ARM64 into AArch64's place This commit starts with a "git mv ARM64 AArch64" and continues out from there, renaming the C++ classes, intrinsics, and other target-local objects for consistency. "ARM64" test directories are also moved, and tests that began their life in ARM64 use an arm64 triple, those from AArch64 use an aarch64 triple. Both should be equivalent though. This finishes the AArch64 merge, and everyone should feel free to continue committing as normal now. llvm-svn: 209577 2014-05-24 20:50:23 +08:00			`%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind`
ARM64: initial backend import This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit. llvm-svn: 205090 2014-03-29 18:18:08 +08:00			`%tmp = trunc i32 %vminv.i to i8`
			`%tobool = icmp eq i8 %tmp, 0`
			`br i1 %tobool, label %return, label %if.then`

			`if.then:`
			`%call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind`
			`br label %return`

			`return:`
			`%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]`
			`ret i32 %retval.0`
			`}`

			`declare i32 @bar(...)`

			`define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {`
			`; CHECK-LABEL: vmin_u4x16:`
			`; CHECK: uminv.4h h[[REG:[0-9]+]], v0`
			`; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]`
			`; CHECK-NOT: and`
			`; CHECK: cbz [[REG2]],`
			`entry:`
AArch64/ARM64: move ARM64 into AArch64's place This commit starts with a "git mv ARM64 AArch64" and continues out from there, renaming the C++ classes, intrinsics, and other target-local objects for consistency. "ARM64" test directories are also moved, and tests that began their life in ARM64 use an arm64 triple, those from AArch64 use an aarch64 triple. Both should be equivalent though. This finishes the AArch64 merge, and everyone should feel free to continue committing as normal now. llvm-svn: 209577 2014-05-24 20:50:23 +08:00			`%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind`
ARM64: initial backend import This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit. llvm-svn: 205090 2014-03-29 18:18:08 +08:00			`%tmp = trunc i32 %vminv.i to i16`
			`%tobool = icmp eq i16 %tmp, 0`
			`br i1 %tobool, label %return, label %if.then`

			`if.then:`
			`%call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind`
			`br label %return`

			`return:`
			`%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]`
			`ret i32 %retval.0`
			`}`

			`define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {`
			`; CHECK-LABEL: vmin_u8x16:`
			`; CHECK: uminv.8h h[[REG:[0-9]+]], v0`
			`; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]`
			`; CHECK-NOT: and`
			`; CHECK: cbz [[REG2]],`
			`entry:`
AArch64/ARM64: move ARM64 into AArch64's place This commit starts with a "git mv ARM64 AArch64" and continues out from there, renaming the C++ classes, intrinsics, and other target-local objects for consistency. "ARM64" test directories are also moved, and tests that began their life in ARM64 use an arm64 triple, those from AArch64 use an aarch64 triple. Both should be equivalent though. This finishes the AArch64 merge, and everyone should feel free to continue committing as normal now. llvm-svn: 209577 2014-05-24 20:50:23 +08:00			`%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind`
ARM64: initial backend import This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit. llvm-svn: 205090 2014-03-29 18:18:08 +08:00			`%tmp = trunc i32 %vminv.i to i16`
			`%tobool = icmp eq i16 %tmp, 0`
			`br i1 %tobool, label %return, label %if.then`

			`if.then:`
			`%call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind`
			`br label %return`

			`return:`
			`%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]`
			`ret i32 %retval.0`
			`}`

			`define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {`
			`; CHECK-LABEL: vmin_u16x8:`
			`; CHECK: uminv.16b b[[REG:[0-9]+]], v0`
			`; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]`
			`; CHECK-NOT: and`
			`; CHECK: cbz [[REG2]],`
			`entry:`
AArch64/ARM64: move ARM64 into AArch64's place This commit starts with a "git mv ARM64 AArch64" and continues out from there, renaming the C++ classes, intrinsics, and other target-local objects for consistency. "ARM64" test directories are also moved, and tests that began their life in ARM64 use an arm64 triple, those from AArch64 use an aarch64 triple. Both should be equivalent though. This finishes the AArch64 merge, and everyone should feel free to continue committing as normal now. llvm-svn: 209577 2014-05-24 20:50:23 +08:00			`%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind`
ARM64: initial backend import This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit. llvm-svn: 205090 2014-03-29 18:18:08 +08:00			`%tmp = trunc i32 %vminv.i to i8`
			`%tobool = icmp eq i8 %tmp, 0`
			`br i1 %tobool, label %return, label %if.then`

			`if.then:`
			`%call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind`
			`br label %return`

			`return:`
			`%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]`
			`ret i32 %retval.0`
			`}`

[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`define <8 x i8> @test_vminv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {`
			`; CHECK-LABEL: test_vminv_u8_used_by_laneop:`
			`; CHECK: uminv.8b b[[REGNUM:[0-9]+]], v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.b v0[3], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a2)`
			`%1 = trunc i32 %0 to i8`
			`%2 = insertelement <8 x i8> %a1, i8 %1, i32 3`
			`ret <8 x i8> %2`
			`}`

			`define <4 x i16> @test_vminv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {`
			`; CHECK-LABEL: test_vminv_u16_used_by_laneop:`
			`; CHECK: uminv.4h h[[REGNUM:[0-9]+]], v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.h v0[3], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a2)`
			`%1 = trunc i32 %0 to i16`
			`%2 = insertelement <4 x i16> %a1, i16 %1, i32 3`
			`ret <4 x i16> %2`
			`}`

			`define <2 x i32> @test_vminv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {`
			`; CHECK-LABEL: test_vminv_u32_used_by_laneop:`
			`; CHECK: uminp.2s v[[REGNUM:[0-9]+]], v1, v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.s v0[1], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a2)`
			`%1 = insertelement <2 x i32> %a1, i32 %0, i32 1`
			`ret <2 x i32> %1`
			`}`

			`define <16 x i8> @test_vminvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {`
			`; CHECK-LABEL: test_vminvq_u8_used_by_laneop:`
			`; CHECK: uminv.16b b[[REGNUM:[0-9]+]], v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.b v0[3], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a2)`
			`%1 = trunc i32 %0 to i8`
			`%2 = insertelement <16 x i8> %a1, i8 %1, i32 3`
			`ret <16 x i8> %2`
			`}`

			`define <8 x i16> @test_vminvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {`
			`; CHECK-LABEL: test_vminvq_u16_used_by_laneop:`
			`; CHECK: uminv.8h h[[REGNUM:[0-9]+]], v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.h v0[3], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a2)`
			`%1 = trunc i32 %0 to i16`
			`%2 = insertelement <8 x i16> %a1, i16 %1, i32 3`
			`ret <8 x i16> %2`
			`}`

			`define <4 x i32> @test_vminvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {`
			`; CHECK-LABEL: test_vminvq_u32_used_by_laneop:`
			`; CHECK: uminv.4s s[[REGNUM:[0-9]+]], v1`
Revert r294437 as it broke an asan buildbot. llvm-svn: 294523 2017-02-09 05:41:16 +08:00			`; CHECK-NEXT: ins.s v0[3], v[[REGNUM]][0]`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`; CHECK-NEXT: ret`
			`entry:`
			`%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a2)`
			`%1 = insertelement <4 x i32> %a1, i32 %0, i32 3`
			`ret <4 x i32> %1`
			`}`
AArch64/ARM64: move ARM64 into AArch64's place This commit starts with a "git mv ARM64 AArch64" and continues out from there, renaming the C++ classes, intrinsics, and other target-local objects for consistency. "ARM64" test directories are also moved, and tests that began their life in ARM64 use an arm64 triple, those from AArch64 use an aarch64 triple. Both should be equivalent though. This finishes the AArch64 merge, and everyone should feel free to continue committing as normal now. llvm-svn: 209577 2014-05-24 20:50:23 +08:00			`declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone`
			`declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone`
			`declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone`
			`declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone`
[AArch64] Avoid going through GPRs for across-vector instructions. This adds new node types for each intrinsic. For instance, for addv, we have AArch64ISD::UADDV, such that: (v4i32 (uaddv ...)) is the same as (v4i32 (scalar_to_vector (i32 (int_aarch64_neon_uaddv ...)))) that is, (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 (int_aarch64_neon_uaddv ...)), ssub) In a combine, we transform all such across-vector-lanes intrinsics to: (i32 (extract_vector_elt (uaddv ...), 0)) This has one big advantage: by making the extract_element explicit, we enable the existing patterns for lane-aware instructions to fire. This lets us avoid needlessly going through the GPRs. Consider: uint32x4_t test_mul(uint32x4_t a, uint32x4_t b) { return vmulq_n_u32(a, vaddvq_u32(b)); } We now generate: addv.4s s1, v1 mul.4s v0, v0, v1[0] instead of the previous: addv.4s s1, v1 fmov w8, s1 dup.4s v1, w8 mul.4s v0, v1, v0 rdar://20044838 llvm-svn: 231840 2015-03-11 04:45:38 +08:00			`declare i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32>) nounwind readnone`
			`declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>) nounwind readnone`