; llvm-project/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1.ll

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; LD1B
;

; Predicated ld1 of a <vscale x 16 x i8> vector with no extension applied;
; the expected selection is one ld1b into .b lanes followed by the return.
define <vscale x 16 x i8> @ld1b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_i8:
; CHECK: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %bytes = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 16 x i8> %bytes
}

; Predicated ld1 of <vscale x 8 x i8>, zero-extended to i16 elements.
; The zext is expected to fold into the load: one ld1b into .h lanes.
define <vscale x 8 x i16> @ld1b_h(<vscale x 8 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_h:
; CHECK: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)
  %wide = zext <vscale x 8 x i8> %narrow to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %wide
}

; Predicated ld1 of <vscale x 8 x i8>, sign-extended to i16 elements.
; The sext is expected to fold into the load: one ld1sb into .h lanes.
define <vscale x 8 x i16> @ld1sb_h(<vscale x 8 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_h:
; CHECK: ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)
  %wide = sext <vscale x 8 x i8> %narrow to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %wide
}

; Predicated ld1 of <vscale x 4 x i8>, zero-extended to i32 elements.
; The zext is expected to fold into the load: one ld1b into .s lanes.
define <vscale x 4 x i32> @ld1b_s(<vscale x 4 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_s:
; CHECK: ld1b { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)
  %wide = zext <vscale x 4 x i8> %narrow to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %wide
}

; Predicated ld1 of <vscale x 4 x i8>, sign-extended to i32 elements.
; The sext is expected to fold into the load: one ld1sb into .s lanes.
define <vscale x 4 x i32> @ld1sb_s(<vscale x 4 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_s:
; CHECK: ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)
  %wide = sext <vscale x 4 x i8> %narrow to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %wide
}

; Predicated ld1 of <vscale x 2 x i8>, zero-extended to i64 elements.
; The zext is expected to fold into the load: one ld1b into .d lanes.
define <vscale x 2 x i64> @ld1b_d(<vscale x 2 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1b_d:
; CHECK: ld1b { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)
  %wide = zext <vscale x 2 x i8> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

; Predicated ld1 of <vscale x 2 x i8>, sign-extended to i64 elements.
; The sext is expected to fold into the load: one ld1sb into .d lanes.
define <vscale x 2 x i64> @ld1sb_d(<vscale x 2 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1sb_d:
; CHECK: ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)
  %wide = sext <vscale x 2 x i8> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

;
; LD1H
;

; Predicated ld1 of a <vscale x 8 x i16> vector with no extension applied;
; the expected selection is one ld1h into .h lanes followed by the return.
define <vscale x 8 x i16> @ld1h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_i16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %halves = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 8 x i16> %halves
}

; Predicated ld1 of a <vscale x 8 x half> vector; the floating-point variant
; is expected to select the same ld1h into .h lanes as the i16 case.
define <vscale x 8 x half> @ld1h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld1h_f16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %vec = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 8 x half> %vec
}

; Predicated ld1 of a <vscale x 8 x bfloat> vector. The function carries
; attribute group #0 (defined at the end of the file); the expected selection
; is one ld1h into .h lanes.
define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: ld1h_bf16:
; CHECK: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %vec = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 8 x bfloat> %vec
}

; Predicated ld1 of <vscale x 4 x i16>, zero-extended to i32 elements.
; The zext is expected to fold into the load: one ld1h into .s lanes.
define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_s:
; CHECK: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)
  %wide = zext <vscale x 4 x i16> %narrow to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %wide
}

; Predicated ld1 of <vscale x 4 x i16>, sign-extended to i32 elements.
; The sext is expected to fold into the load: one ld1sh into .s lanes.
define <vscale x 4 x i32> @ld1sh_s(<vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1sh_s:
; CHECK: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)
  %wide = sext <vscale x 4 x i16> %narrow to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %wide
}

; Predicated ld1 of <vscale x 2 x i16>, zero-extended to i64 elements.
; The zext is expected to fold into the load: one ld1h into .d lanes.
define <vscale x 2 x i64> @ld1h_d(<vscale x 2 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_d:
; CHECK: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)
  %wide = zext <vscale x 2 x i16> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

; Predicated ld1 of <vscale x 2 x i16>, sign-extended to i64 elements.
; The sext is expected to fold into the load: one ld1sh into .d lanes.
define <vscale x 2 x i64> @ld1sh_d(<vscale x 2 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1sh_d:
; CHECK: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)
  %wide = sext <vscale x 2 x i16> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

;
; LD1W
;

; Predicated ld1 of a <vscale x 4 x i32> vector with no extension applied;
; the expected selection is one ld1w into .s lanes followed by the return.
define <vscale x 4 x i32> @ld1w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1w_i32:
; CHECK: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %words = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 4 x i32> %words
}

; Predicated ld1 of a <vscale x 4 x float> vector; the floating-point variant
; is expected to select the same ld1w into .s lanes as the i32 case.
define <vscale x 4 x float> @ld1w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld1w_f32:
; CHECK: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %vec = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 4 x float> %vec
}

; Predicated ld1 of <vscale x 2 x i32>, zero-extended to i64 elements.
; The zext is expected to fold into the load: one ld1w into .d lanes.
define <vscale x 2 x i64> @ld1w_d(<vscale x 2 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1w_d:
; CHECK: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)
  %wide = zext <vscale x 2 x i32> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

; Predicated ld1 of <vscale x 2 x i32>, sign-extended to i64 elements.
; The sext is expected to fold into the load: one ld1sw into .d lanes.
define <vscale x 2 x i64> @ld1sw_d(<vscale x 2 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1sw_d:
; CHECK: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %narrow = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)
  %wide = sext <vscale x 2 x i32> %narrow to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %wide
}

;
; LD1D
;

; Predicated ld1 of a <vscale x 2 x i64> vector with no extension applied;
; the expected selection is one ld1d into .d lanes followed by the return.
define <vscale x 2 x i64> @ld1d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld1d_i64:
; CHECK: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %dwords = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 2 x i64> %dwords
}

; Predicated ld1 of a <vscale x 2 x double> vector; the floating-point variant
; is expected to select the same ld1d into .d lanes as the i64 case.
define <vscale x 2 x double> @ld1d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld1d_f64:
; CHECK: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %vec = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 2 x double> %vec
}

; Declarations of every llvm.aarch64.sve.ld1 overload called above, grouped
; by the pointee type of the address operand.

; i8* sources (ld1b / ld1sb tests).
declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)

; i16*, half* and bfloat* sources (ld1h / ld1sh tests).
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, i16*)

; i32* and float* sources (ld1w / ld1sw tests).
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)

; i64* and double* sources (ld1d tests).
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)

; Attribute group #0 is applied to ld1h_bf16 only: the bfloat version needs
; +bf16 enabled alongside +sve.
attributes #0 = { "target-features"="+sve,+bf16" }
[SVE] Add more warnings checks to clang and LLVM SVE tests There are now more SVE tests in LLVM and Clang that do not emit warnings related to invalid use of EVT::getVectorNumElements() and VectorType::getNumElements(). For these tests I have added additional checks that there are no warnings in order to prevent any future regressions. Differential Revision: https://reviews.llvm.org/D82943 2020-06-19 16:06:21 +08:00			`; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t \| FileCheck %s`
[SVE] Fall back on DAG ISel at -O0 when encountering scalable types At the moment we use Global ISel by default at -O0, however it is currently not capable of dealing with scalable vectors for two reasons: 1. The register banks know nothing about SVE registers. 2. The LLT (Low Level Type) class knows nothing about scalable vectors. For now, the easiest way to avoid users hitting issues when using the SVE ACLE is to fall back on normal DAG ISel when encountering instructions that operate on scalable vector types. I've added a couple of RUN lines to existing SVE tests to ensure we can compile at -O0. I've also added some new tests to CodeGen/AArch64/GlobalISel/arm64-fallback.ll that demonstrate we correctly fallback to DAG ISel at -O0 when lowering formal arguments or translating instructions that involve scalable vector types. Differential Revision: https://reviews.llvm.org/D81557 2020-06-09 21:51:38 +08:00			`; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve < %s \| FileCheck %s`
[SVE] Add more warnings checks to clang and LLVM SVE tests There are now more SVE tests in LLVM and Clang that do not emit warnings related to invalid use of EVT::getVectorNumElements() and VectorType::getNumElements(). For these tests I have added additional checks that there are no warnings in order to prevent any future regressions. Differential Revision: https://reviews.llvm.org/D82943 2020-06-19 16:06:21 +08:00			`; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t`

[SVE][CodeGen] Add README for SVE-related warnings in tests I have added a new file: llvm/test/CodeGen/AArch64/README that describes what to do in the event one of the SVE codegen tests fails the warnings check. In addition, I've added comments to all the relevant SVE tests pointing users at the README file. Differential Revision: https://reviews.llvm.org/D83467 2020-07-09 17:16:32 +08:00			`; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.`
[SVE] Add more warnings checks to clang and LLVM SVE tests There are now more SVE tests in LLVM and Clang that do not emit warnings related to invalid use of EVT::getVectorNumElements() and VectorType::getNumElements(). For these tests I have added additional checks that there are no warnings in order to prevent any future regressions. Differential Revision: https://reviews.llvm.org/D82943 2020-06-19 16:06:21 +08:00			`; WARN-NOT: warning`
[AArch64][SVE] Remove LD1/ST1 dependency on llvm.masked.load/store Summary: The SVE masked load and store intrinsics introduced in D76688 rely on common llvm.masked.load/store nodes. This patch creates new ISD nodes for LD1(S) & ST1 to remove this dependency. Additionally, this adds support for sign & zero extending loads and truncating stores. Reviewers: sdesmalen, efriedma, cameron.mcinally, c-rhodes, rengolin Reviewed By: efriedma Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, andwar, cfe-commits, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D78204 2020-04-20 18:07:48 +08:00
			`;`
			`; LD1B`
			`;`

			`define <vscale x 16 x i8> @ld1b_i8(<vscale x 16 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1b_i8:`
			`; CHECK: ld1b { z0.b }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)`
			`ret <vscale x 16 x i8> %res`
			`}`

			`define <vscale x 8 x i16> @ld1b_h(<vscale x 8 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1b_h:`
			`; CHECK: ld1b { z0.h }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)`
			`%res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>`
			`ret <vscale x 8 x i16> %res`
			`}`

			`define <vscale x 8 x i16> @ld1sb_h(<vscale x 8 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1sb_h:`
			`; CHECK: ld1sb { z0.h }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pred, i8* %addr)`
			`%res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>`
			`ret <vscale x 8 x i16> %res`
			`}`

			`define <vscale x 4 x i32> @ld1b_s(<vscale x 4 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1b_s:`
			`; CHECK: ld1b { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)`
			`%res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>`
			`ret <vscale x 4 x i32> %res`
			`}`

			`define <vscale x 4 x i32> @ld1sb_s(<vscale x 4 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1sb_s:`
			`; CHECK: ld1sb { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pred, i8* %addr)`
			`%res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>`
			`ret <vscale x 4 x i32> %res`
			`}`

			`define <vscale x 2 x i64> @ld1b_d(<vscale x 2 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1b_d:`
			`; CHECK: ld1b { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)`
			`%res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`define <vscale x 2 x i64> @ld1sb_d(<vscale x 2 x i1> %pred, i8* %addr) {`
			`; CHECK-LABEL: ld1sb_d:`
			`; CHECK: ld1sb { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pred, i8* %addr)`
			`%res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`;`
			`; LD1H`
			`;`

			`define <vscale x 8 x i16> @ld1h_i16(<vscale x 8 x i1> %pred, i16* %addr) {`
			`; CHECK-LABEL: ld1h_i16:`
			`; CHECK: ld1h { z0.h }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)`
			`ret <vscale x 8 x i16> %res`
			`}`

			`define <vscale x 8 x half> @ld1h_f16(<vscale x 8 x i1> %pred, half* %addr) {`
			`; CHECK-LABEL: ld1h_f16:`
			`; CHECK: ld1h { z0.h }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)`
			`ret <vscale x 8 x half> %res`
			`}`

[AArch64][SVE] Predicate bfloat16 load patterns with HasBF16 Reviewers: sdesmalen, c-rhodes, efriedma, fpetrogalli Reviewed By: fpetrogalli Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D82464 2020-06-26 16:48:53 +08:00			`define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) #0 {`
[AArch64][SVE] Add bfloat16 support to load intrinsics Summary: Bfloat16 support added for the following intrinsics: - LD1 - LD1RQ - LDNT1 - LDNF1 - LDFF1 Reviewers: sdesmalen, c-rhodes, efriedma, stuij, fpetrogalli, david-arm Reviewed By: fpetrogalli Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D82298 2020-06-24 16:47:49 +08:00			`; CHECK-LABEL: ld1h_bf16:`
			`; CHECK: ld1h { z0.h }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)`
			`ret <vscale x 8 x bfloat> %res`
			`}`

[AArch64][SVE] Remove LD1/ST1 dependency on llvm.masked.load/store Summary: The SVE masked load and store intrinsics introduced in D76688 rely on common llvm.masked.load/store nodes. This patch creates new ISD nodes for LD1(S) & ST1 to remove this dependency. Additionally, this adds support for sign & zero extending loads and truncating stores. Reviewers: sdesmalen, efriedma, cameron.mcinally, c-rhodes, rengolin Reviewed By: efriedma Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, andwar, cfe-commits, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D78204 2020-04-20 18:07:48 +08:00			`define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {`
			`; CHECK-LABEL: ld1h_s:`
			`; CHECK: ld1h { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)`
			`%res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>`
			`ret <vscale x 4 x i32> %res`
			`}`

			`define <vscale x 4 x i32> @ld1sh_s(<vscale x 4 x i1> %pred, i16* %addr) {`
			`; CHECK-LABEL: ld1sh_s:`
			`; CHECK: ld1sh { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pred, i16* %addr)`
			`%res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>`
			`ret <vscale x 4 x i32> %res`
			`}`

			`define <vscale x 2 x i64> @ld1h_d(<vscale x 2 x i1> %pred, i16* %addr) {`
			`; CHECK-LABEL: ld1h_d:`
			`; CHECK: ld1h { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)`
			`%res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`define <vscale x 2 x i64> @ld1sh_d(<vscale x 2 x i1> %pred, i16* %addr) {`
			`; CHECK-LABEL: ld1sh_d:`
			`; CHECK: ld1sh { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pred, i16* %addr)`
			`%res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`;`
			`; LD1W`
			`;`

			`define <vscale x 4 x i32> @ld1w_i32(<vscale x 4 x i1> %pred, i32* %addr) {`
			`; CHECK-LABEL: ld1w_i32:`
			`; CHECK: ld1w { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)`
			`ret <vscale x 4 x i32> %res`
			`}`

			`define <vscale x 4 x float> @ld1w_f32(<vscale x 4 x i1> %pred, float* %addr) {`
			`; CHECK-LABEL: ld1w_f32:`
			`; CHECK: ld1w { z0.s }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)`
			`ret <vscale x 4 x float> %res`
			`}`

			`define <vscale x 2 x i64> @ld1w_d(<vscale x 2 x i1> %pred, i32* %addr) {`
			`; CHECK-LABEL: ld1w_d:`
			`; CHECK: ld1w { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)`
			`%res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`define <vscale x 2 x i64> @ld1sw_d(<vscale x 2 x i1> %pred, i32* %addr) {`
			`; CHECK-LABEL: ld1sw_d:`
			`; CHECK: ld1sw { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pred, i32* %addr)`
			`%res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>`
			`ret <vscale x 2 x i64> %res`
			`}`

			`;`
			`; LD1D`
			`;`

			`define <vscale x 2 x i64> @ld1d_i64(<vscale x 2 x i1> %pred, i64* %addr) {`
			`; CHECK-LABEL: ld1d_i64:`
			`; CHECK: ld1d { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pred,`
			`i64* %addr)`
			`ret <vscale x 2 x i64> %res`
			`}`

			`define <vscale x 2 x double> @ld1d_f64(<vscale x 2 x i1> %pred, double* %addr) {`
			`; CHECK-LABEL: ld1d_f64:`
			`; CHECK: ld1d { z0.d }, p0/z, [x0]`
			`; CHECK-NEXT: ret`
			`%res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pred,`
			`double* %addr)`
			`ret <vscale x 2 x double> %res`
			`}`

			`declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)`

			`declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)`
			`declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)`
			`declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)`
[AArch64][SVE] Add bfloat16 support to load intrinsics Summary: Bfloat16 support added for the following intrinsics: - LD1 - LD1RQ - LDNT1 - LDNF1 - LDFF1 Reviewers: sdesmalen, c-rhodes, efriedma, stuij, fpetrogalli, david-arm Reviewed By: fpetrogalli Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D82298 2020-06-24 16:47:49 +08:00			`declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)`
[AArch64][SVE] Remove LD1/ST1 dependency on llvm.masked.load/store Summary: The SVE masked load and store intrinsics introduced in D76688 rely on common llvm.masked.load/store nodes. This patch creates new ISD nodes for LD1(S) & ST1 to remove this dependency. Additionally, this adds support for sign & zero extending loads and truncating stores. Reviewers: sdesmalen, efriedma, cameron.mcinally, c-rhodes, rengolin Reviewed By: efriedma Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, andwar, cfe-commits, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D78204 2020-04-20 18:07:48 +08:00
			`declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)`
			`declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)`
			`declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, float*)`

			`declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)`
			`declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, i16*)`
			`declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)`
			`declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, i64*)`
			`declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)`
[AArch64][SVE] Predicate bfloat16 load patterns with HasBF16 Reviewers: sdesmalen, c-rhodes, efriedma, fpetrogalli Reviewed By: fpetrogalli Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D82464 2020-06-26 16:48:53 +08:00
			`; +bf16 is required for the bfloat version.`
			`attributes #0 = { "target-features"="+sve,+bf16" }`