forked from OSchip/llvm-project
[AArch64][SVE] Add bfloat16 support to load intrinsics
Summary: Bfloat16 support added for the following intrinsics: - LD1 - LD1RQ - LDNT1 - LDNF1 - LDFF1 Reviewers: sdesmalen, c-rhodes, efriedma, stuij, fpetrogalli, david-arm Reviewed By: fpetrogalli Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D82298
This commit is contained in:
parent
1b090db0df
commit
3d6cab271c
|
@ -271,6 +271,11 @@ def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtRetu
|
|||
def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">;
|
||||
def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">;
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
|
||||
def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
|
||||
}
|
||||
|
||||
// Load one vector (scalar base, VL displacement)
|
||||
def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
|
||||
def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">;
|
||||
|
@ -376,6 +381,11 @@ def SVLDFF1UH_VNUM : MInst<"svldff1uh_vnum_{d}", "dPXl", "ilUiUl", [IsL
|
|||
def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldff1">;
|
||||
def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">;
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
def SVLDFF1_BF : MInst<"svldff1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
|
||||
def SVLDFF1_VNUM_BF : MInst<"svldff1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
|
||||
}
|
||||
|
||||
// First-faulting load one vector (vector base)
|
||||
def SVLDFF1_GATHER_BASES_U : MInst<"svldff1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">;
|
||||
def SVLDFF1SB_GATHER_BASES_U : MInst<"svldff1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl", [IsGatherLoad], MemEltTyInt8, "aarch64_sve_ldff1_gather_scalar_offset">;
|
||||
|
@ -471,15 +481,29 @@ def SVLDNF1UH_VNUM : MInst<"svldnf1uh_vnum_{d}", "dPXl", "ilUiUl", [IsL
|
|||
def SVLDNF1SW_VNUM : MInst<"svldnf1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldnf1">;
|
||||
def SVLDNF1UW_VNUM : MInst<"svldnf1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">;
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
def SVLDNF1_BF : MInst<"svldnf1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
|
||||
def SVLDNF1_VNUM_BF : MInst<"svldnf1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
|
||||
}
|
||||
|
||||
// Load one vector, unextended load, non-temporal (scalar base)
|
||||
def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
|
||||
|
||||
// Load one vector, unextended load, non-temporal (scalar base, VL displacement)
|
||||
def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
|
||||
def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
|
||||
}
|
||||
|
||||
// Load one quadword and replicate (scalar base)
|
||||
def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">;
|
||||
}
|
||||
|
||||
multiclass StructLoad<string name, string proto, string i> {
|
||||
def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
|
||||
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef SVE_OVERLOADED_FORMS
|
||||
// A simple used,unused... macro, long enough to represent any SVE builtin.
|
||||
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
|
||||
#else
|
||||
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
|
||||
#endif
|
||||
|
||||
svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base)
|
||||
{
|
||||
// CHECK-LABEL: test_svld1_bf16
|
||||
// CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svld1_bf16'}}
|
||||
return SVE_ACLE_FUNC(svld1,_bf16,,)(pg, base);
|
||||
}
|
||||
|
||||
svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
|
||||
{
|
||||
// CHECK-LABEL: test_svld1_vnum_bf16
|
||||
// CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
|
||||
// CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svld1_vnum_bf16'}}
|
||||
return SVE_ACLE_FUNC(svld1_vnum,_bf16,,)(pg, base, vnum);
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef SVE_OVERLOADED_FORMS
|
||||
// A simple used,unused... macro, long enough to represent any SVE builtin.
|
||||
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
|
||||
#else
|
||||
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
|
||||
#endif
|
||||
|
||||
svbfloat16_t test_svld1rq_bf16(svbool_t pg, const bfloat16_t *base)
|
||||
{
|
||||
// CHECK-LABEL: test_svld1rq_bf16
|
||||
// CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svld1rq_bf16'}}
|
||||
return SVE_ACLE_FUNC(svld1rq,_bf16,,)(pg, base);
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef SVE_OVERLOADED_FORMS
|
||||
// A simple used,unused... macro, long enough to represent any SVE builtin.
|
||||
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
|
||||
#else
|
||||
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
|
||||
#endif
|
||||
|
||||
svbfloat16_t test_svldff1_bf16(svbool_t pg, const bfloat16_t *base)
|
||||
{
|
||||
// CHECK-LABEL: test_svldff1_bf16
|
||||
// CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldff1_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldff1,_bf16,,)(pg, base);
|
||||
}
|
||||
|
||||
svbfloat16_t test_svldff1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
|
||||
{
|
||||
// CHECK-LABEL: test_svldff1_vnum_bf16
|
||||
// CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
|
||||
// CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
|
||||
// CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldff1_vnum_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldff1_vnum,_bf16,,)(pg, base, vnum);
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef SVE_OVERLOADED_FORMS
|
||||
// A simple used,unused... macro, long enough to represent any SVE builtin.
|
||||
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
|
||||
#else
|
||||
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
|
||||
#endif
|
||||
|
||||
svbfloat16_t test_svldnf1_bf16(svbool_t pg, const bfloat16_t *base)
|
||||
{
|
||||
// CHECK-LABEL: test_svldnf1_bf16
|
||||
// CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldnf1_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldnf1,_bf16,,)(pg, base);
|
||||
}
|
||||
|
||||
svbfloat16_t test_svldnf1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
|
||||
{
|
||||
// CHECK-LABEL: test_svldnf1_vnum_bf16
|
||||
// CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
|
||||
// CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
|
||||
// CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldnf1_vnum_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldnf1_vnum,_bf16,,)(pg, base, vnum);
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef SVE_OVERLOADED_FORMS
|
||||
// A simple used,unused... macro, long enough to represent any SVE builtin.
|
||||
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
|
||||
#else
|
||||
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
|
||||
#endif
|
||||
|
||||
svbfloat16_t test_svldnt1_bf16(svbool_t pg, const bfloat16_t *base)
|
||||
{
|
||||
// CHECK-LABEL: test_svldnt1_bf16
|
||||
// CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldnt1_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldnt1,_bf16,,)(pg, base);
|
||||
}
|
||||
|
||||
svbfloat16_t test_svldnt1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
|
||||
{
|
||||
// CHECK-LABEL: test_svldnt1_vnum_bf16
|
||||
// CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
|
||||
// CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
|
||||
// CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
|
||||
// CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
|
||||
// CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
|
||||
// expected-warning@+1 {{implicit declaration of function 'svldnt1_vnum_bf16'}}
|
||||
return SVE_ACLE_FUNC(svldnt1_vnum,_bf16,,)(pg, base, vnum);
|
||||
}
|
|
@ -1518,10 +1518,11 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
|
|||
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
|
||||
|
||||
// 8-element contiguous loads
|
||||
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
|
||||
defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
|
||||
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
|
||||
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
|
||||
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
|
||||
defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
|
||||
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
|
||||
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
|
||||
defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
|
||||
|
||||
// 16-element contiguous loads
|
||||
defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
|
||||
|
@ -1704,10 +1705,11 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
|
|||
defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
|
||||
|
||||
// 8-element contiguous loads
|
||||
defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
|
||||
defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
|
||||
defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
|
||||
defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
|
||||
defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
|
||||
|
||||
// 16-element contiguous loads
|
||||
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
|
||||
|
@ -1725,31 +1727,32 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
|
|||
}
|
||||
|
||||
// 2-element contiguous non-faulting loads
|
||||
defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>;
|
||||
defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
|
||||
defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>;
|
||||
defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
|
||||
defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>;
|
||||
defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
|
||||
defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>;
|
||||
defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>;
|
||||
defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>;
|
||||
defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
|
||||
defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>;
|
||||
defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
|
||||
defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>;
|
||||
defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
|
||||
defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>;
|
||||
defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>;
|
||||
|
||||
// 4-element contiguous non-faulting loads
|
||||
defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>;
|
||||
defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
|
||||
defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>;
|
||||
defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
|
||||
defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>;
|
||||
defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>;
|
||||
defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>;
|
||||
defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
|
||||
defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>;
|
||||
defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
|
||||
defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>;
|
||||
defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>;
|
||||
|
||||
// 8-element contiguous non-faulting loads
|
||||
defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>;
|
||||
defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
|
||||
defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>;
|
||||
defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>;
|
||||
defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>;
|
||||
defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
|
||||
defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>;
|
||||
defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>;
|
||||
defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1, nxv8i1, nxv8bf16>;
|
||||
|
||||
// 16-element contiguous non-faulting loads
|
||||
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
|
||||
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
|
||||
|
||||
multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
|
||||
// reg + reg
|
||||
|
@ -1764,29 +1767,30 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
|
|||
}
|
||||
|
||||
// 2-element contiguous first faulting loads
|
||||
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
|
||||
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
|
||||
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
|
||||
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
|
||||
|
||||
// 4-element contiguous first faulting loads
|
||||
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
|
||||
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
|
||||
|
||||
// 8-element contiguous first faulting loads
|
||||
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
|
||||
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
|
||||
defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
|
||||
|
||||
// 16-element contiguous first faulting loads
|
||||
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
|
||||
|
|
|
@ -161,7 +161,8 @@ public:
|
|||
return false;
|
||||
|
||||
Type *Ty = cast<VectorType>(DataType)->getElementType();
|
||||
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
|
||||
if (Ty->isBFloatTy() || Ty->isHalfTy() ||
|
||||
Ty->isFloatTy() || Ty->isDoubleTy())
|
||||
return true;
|
||||
|
||||
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
|
||||
|
|
|
@ -207,6 +207,17 @@ define <vscale x 8 x half> @ld1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
|
||||
; CHECK-LABEL: ld1h_bf16_inbound:
|
||||
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
|
||||
%base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
|
||||
%base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
;
|
||||
; LD1W
|
||||
;
|
||||
|
@ -288,6 +299,7 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*
|
|||
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
|
||||
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
|
||||
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
|
||||
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
|
||||
|
|
|
@ -95,6 +95,15 @@ define <vscale x 8 x half> @ld1h_f16(<vscale x 8 x i1> %pg, half* %a, i64 %index
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a, i64 %index) {
|
||||
; CHECK-LABEL: ld1h_bf16
|
||||
; CHECK: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: ret
|
||||
%base = getelementptr bfloat, bfloat* %a, i64 %index
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %a, i64 %index) {
|
||||
; CHECK-LABEL: ld1h_s:
|
||||
; CHECK: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
|
||||
|
@ -204,6 +213,7 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*
|
|||
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
|
||||
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
|
||||
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
|
||||
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
|
||||
|
|
|
@ -87,6 +87,14 @@ define <vscale x 8 x half> @ld1h_f16(<vscale x 8 x i1> %pred, half* %addr) {
|
|||
ret <vscale x 8 x half> %res
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
|
||||
; CHECK-LABEL: ld1h_bf16:
|
||||
; CHECK: ld1h { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
|
||||
ret <vscale x 8 x bfloat> %res
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {
|
||||
; CHECK-LABEL: ld1h_s:
|
||||
; CHECK: ld1h { z0.s }, p0/z, [x0]
|
||||
|
@ -188,6 +196,7 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*
|
|||
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
|
||||
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
|
||||
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
|
||||
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
|
||||
|
|
|
@ -206,6 +206,14 @@ define <vscale x 8 x half> @ldff1h_f16(<vscale x 8 x i1> %pg, half* %a) {
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ldff1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
|
||||
; CHECK-LABEL: ldff1h_bf16:
|
||||
; CHECK: ldff1h { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x half> @ldff1h_f16_reg(<vscale x 8 x i1> %pg, half* %a, i64 %offset) {
|
||||
; CHECK-LABEL: ldff1h_f16_reg:
|
||||
; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
|
||||
|
@ -215,6 +223,15 @@ define <vscale x 8 x half> @ldff1h_f16_reg(<vscale x 8 x i1> %pg, half* %a, i64
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ldff1h_bf16_reg(<vscale x 8 x i1> %pg, bfloat* %a, i64 %offset) {
|
||||
; CHECK-LABEL: ldff1h_bf16_reg:
|
||||
; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: ret
|
||||
%base = getelementptr bfloat, bfloat* %a, i64 %offset
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
;
|
||||
; LDFF1SH
|
||||
;
|
||||
|
@ -398,6 +415,7 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i
|
|||
declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
|
||||
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
|
||||
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
|
||||
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
|
||||
|
|
|
@ -140,6 +140,14 @@ define <vscale x 8 x half> @ldnf1h_f16(<vscale x 8 x i1> %pg, half* %a) {
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ldnf1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
|
||||
; CHECK-LABEL: ldnf1h_bf16:
|
||||
; CHECK: ldnf1h { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x half> @ldnf1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
|
||||
; CHECK-LABEL: ldnf1h_f16_inbound:
|
||||
; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
|
||||
|
@ -151,6 +159,17 @@ define <vscale x 8 x half> @ldnf1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a)
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ldnf1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
|
||||
; CHECK-LABEL: ldnf1h_bf16_inbound:
|
||||
; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
|
||||
%base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
|
||||
%base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
|
||||
%load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
|
||||
; CHECK-LABEL: ldnf1b_s:
|
||||
; CHECK: ldnf1b { z0.s }, p0/z, [x0]
|
||||
|
@ -442,6 +461,7 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1>, i
|
|||
declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
|
||||
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
|
||||
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
|
||||
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
|
||||
|
|
|
@ -97,6 +97,23 @@ define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr)
|
|||
ret <vscale x 8 x half> %res
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
|
||||
; CHECK-LABEL: ld1rqh_bf16:
|
||||
; CHECK: ld1rqh { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
|
||||
ret <vscale x 8 x bfloat> %res
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %addr) {
|
||||
; CHECK-LABEL: ld1rqh_bf16_imm:
|
||||
; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
|
||||
; CHECK-NEXT: ret
|
||||
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i16 -8
|
||||
%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
|
||||
ret <vscale x 8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; LD1RQW
|
||||
;
|
||||
|
@ -208,6 +225,15 @@ define <vscale x 8 x half> @ldnt1h_f16(<vscale x 8 x i1> %pred, half* %addr) {
|
|||
ret <vscale x 8 x half> %res
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
|
||||
; CHECK-LABEL: ldnt1h_bf16:
|
||||
; CHECK: ldnt1h { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred,
|
||||
bfloat* %addr)
|
||||
ret <vscale x 8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; LDNT1W
|
||||
;
|
||||
|
@ -498,6 +524,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i1
|
|||
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
|
||||
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
|
||||
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)
|
||||
|
||||
|
@ -506,6 +533,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i1
|
|||
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
|
||||
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
|
||||
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
|
||||
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
|
||||
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
|
||||
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
|
||||
|
||||
|
|
|
@ -87,6 +87,14 @@ define <vscale x 8 x half> @masked_load_nxv8f16(<vscale x 8 x half> *%a, <vscale
|
|||
ret <vscale x 8 x half> %load
|
||||
}
|
||||
|
||||
define <vscale x 8 x bfloat> @masked_load_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_load_nxv8bf16:
|
||||
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%load = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
|
||||
ret <vscale x 8 x bfloat> %load
|
||||
}
|
||||
|
||||
;
|
||||
; Masked Stores
|
||||
;
|
||||
|
@ -182,6 +190,7 @@ declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32,
|
|||
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
|
||||
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
|
||||
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
|
||||
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
|
||||
|
||||
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
|
||||
|
|
Loading…
Reference in New Issue