[PATCH] [ARM] ARMv8.6-a command-line + BFloat16 Asm Support

Summary: This patch introduces command-line support for the Armv8.6-a architecture and assembly support for BFloat16. Details can be found at https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a in addition to the GCC patch for the 8.6-a CLI: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-11/msg02647.html In detail, this patch adds: - march options for armv8.6-a - BFloat16 assembly. This is part of a patch series, starting with command-line and BFloat16 assembly support. The subsequent patches will upstream intrinsics support for BFloat16, followed by Matrix Multiplication and the remaining Virtualization features of the armv8.6-a architecture. Based on work by: - labrinea - MarkMurrayARM - Luke Cheeseman - Javed Asbar - Mikhail Maltsev - Luke Geeson Reviewers: SjoerdMeijer, craig.topper, rjmccall, jfb, LukeGeeson Reviewed By: SjoerdMeijer Subscribers: stuij, kristof.beyls, hiraditya, dexonsmith, danielkiss, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D76062
2020-03-26 08:17:29 +00:00 · 2020-03-26 08:17:29 +00:00 · 71ae267d1f
parent 6a946993d5
commit 71ae267d1f
50 changed files with 1650 additions and 16 deletions
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@ -151,6 +151,7 @@ void AArch64TargetInfo::fillValidCPUList(

 void AArch64TargetInfo::getTargetDefinesARMV81A(const LangOptions &Opts,
                                                MacroBuilder &Builder) const {
+  // FIXME: Armv8.1 makes __ARM_FEATURE_CRC32 mandatory. Handle it here.
  Builder.defineMacro("__ARM_FEATURE_QRDMX", "1");
 }

@ -171,17 +172,26 @@ void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts,
 void AArch64TargetInfo::getTargetDefinesARMV84A(const LangOptions &Opts,
                                                MacroBuilder &Builder) const {
  // Also include the Armv8.3 defines
-  // FIXME: Armv8.4 makes some extensions mandatory. Handle them here.
+  // FIXME: Armv8.4 makes __ARM_FEATURE_ATOMICS, defined in GCC, mandatory.
+  // Add and handle it here.
  getTargetDefinesARMV83A(Opts, Builder);
 }

 void AArch64TargetInfo::getTargetDefinesARMV85A(const LangOptions &Opts,
                                                MacroBuilder &Builder) const {
  // Also include the Armv8.4 defines
-  // FIXME: Armv8.5 makes some extensions mandatory. Handle them here.
  getTargetDefinesARMV84A(Opts, Builder);
 }

+void AArch64TargetInfo::getTargetDefinesARMV86A(const LangOptions &Opts,
+                                                MacroBuilder &Builder) const {
+  // No Armv8.6-specific macro is defined yet: this only forwards to the
+  // Armv8.5 defines. FIXME: Armv8.6 makes the following extensions mandatory,
+  // so their feature macros should be handled here:
+  // - __ARM_FEATURE_BF16
+  // - __ARM_FEATURE_MATMUL_INT8
+  getTargetDefinesARMV85A(Opts, Builder);
+}

 void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
                                         MacroBuilder &Builder) const {
@ -290,6 +300,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
  case llvm::AArch64::ArchKind::ARMV8_5A:
    getTargetDefinesARMV85A(Opts, Builder);
    break;
+  case llvm::AArch64::ArchKind::ARMV8_6A:
+    getTargetDefinesARMV86A(Opts, Builder);
+    break;
  }

  // All of the __sync_(bool|val)_compare_and_swap_(1|2|4|8) builtins work.
@ -344,6 +357,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
      ArchKind = llvm::AArch64::ArchKind::ARMV8_4A;
    if (Feature == "+v8.5a")
      ArchKind = llvm::AArch64::ArchKind::ARMV8_5A;
+    if (Feature == "+v8.6a")
+      ArchKind = llvm::AArch64::ArchKind::ARMV8_6A;
    if (Feature == "+fullfp16")
      HasFullFP16 = true;
    if (Feature == "+dotprod")
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@ -70,6 +70,8 @@ public:
                               MacroBuilder &Builder) const;
  void getTargetDefinesARMV85A(const LangOptions &Opts,
                               MacroBuilder &Builder) const;
+  void getTargetDefinesARMV86A(const LangOptions &Opts,
+                               MacroBuilder &Builder) const;
  void getTargetDefines(const LangOptions &Opts,
                        MacroBuilder &Builder) const override;

--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@ -201,6 +201,8 @@ StringRef ARMTargetInfo::getCPUAttr() const {
    return "8_4A";
  case llvm::ARM::ArchKind::ARMV8_5A:
    return "8_5A";
+  case llvm::ARM::ArchKind::ARMV8_6A:
+    return "8_6A";
  case llvm::ARM::ArchKind::ARMV8MBaseline:
    return "8M_BASE";
  case llvm::ARM::ArchKind::ARMV8MMainline:
@ -830,6 +832,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
  case llvm::ARM::ArchKind::ARMV8_3A:
  case llvm::ARM::ArchKind::ARMV8_4A:
  case llvm::ARM::ArchKind::ARMV8_5A:
+  case llvm::ARM::ArchKind::ARMV8_6A:
    getTargetDefinesARMV83A(Opts, Builder);
    break;
  }
--- a/clang/test/Driver/aarch64-cpus.c
+++ b/clang/test/Driver/aarch64-cpus.c
@ -603,6 +603,39 @@
 // RUN: %clang -target aarch64 -march=armv8.5-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-FP16 %s
 // GENERICV85A-FP16: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v8.5a" "-target-feature" "+fullfp16"

+// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// RUN: %clang -target aarch64 -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// RUN: %clang -target aarch64 -mlittle-endian -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// RUN: %clang -target aarch64 -mlittle-endian -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// RUN: %clang -target aarch64_be -mlittle-endian -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// RUN: %clang -target aarch64_be -mlittle-endian -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A %s
+// GENERICV86A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v8.6a"
+
+// RUN: %clang -target aarch64_be -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// RUN: %clang -target aarch64_be -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV86A-BE %s
+// GENERICV86A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v8.6a"
+
+// The SVE extension is an optional extension for Armv8-A.
+// RUN: %clang -target aarch64 -march=armv8a+sve -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-SVE %s
+// RUN: %clang -target aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-SVE %s
+// GENERICV8A-SVE: "-target-feature" "+sve"
+// RUN: %clang -target aarch64 -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-NOSVE %s
+// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-NOSVE %s
+// GENERICV8A-NOSVE-NOT: "-target-feature" "+sve"
+
+// The BFloat16 extension is a mandatory component of the Armv8.6-A extensions, but is permitted as an
+// optional feature for any implementation of Armv8.2-A to Armv8.5-A (inclusive)
+// RUN: %clang -target aarch64 -march=armv8.5a+bf16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-BF16 %s
+// GENERICV85A-BF16: "-target-feature" "+bf16"
+// RUN: %clang -target aarch64 -march=armv8.5a+bf16+nobf16 -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV85A-BF16-NO-BF16 %s
+// GENERICV85A-BF16-NO-BF16: "-target-feature" "-bf16"
+// RUN: %clang -target aarch64 -march=armv8.5a+bf16+sve -### -c %s 2>&1 | FileCheck -check-prefixes=GENERICV85A-BF16-SVE %s
+// GENERICV85A-BF16-SVE: "-target-feature" "+bf16" "-target-feature" "+sve"
+
 // fullfp16 is off by default for v8a, feature must not be mentioned
 // RUN: %clang -target aarch64 -march=armv8a  -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s
 // RUN: %clang -target aarch64 -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s
--- a/clang/test/Driver/arm-cortex-cpus.c
+++ b/clang/test/Driver/arm-cortex-cpus.c
@ -335,6 +335,23 @@
 // RUN: %clang -target arm -march=armebv8.5-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V85A %s
 // CHECK-BE-V85A: "-cc1"{{.*}} "-triple" "armebv8.5{{.*}}" "-target-cpu" "generic"

+// RUN: %clang -target armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target arm -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target arm -march=armv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target arm -march=armv8.6a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target armv8.6a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target arm -march=armv8.6-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// RUN: %clang -target arm -mlittle-endian -march=armv8.6-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V86A %s
+// CHECK-V86A: "-cc1"{{.*}} "-triple" "armv8.6{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// RUN: %clang -target armv8.6a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// RUN: %clang -target armeb -march=armebv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// RUN: %clang -target armeb -march=armebv8.6-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// RUN: %clang -target arm -march=armebv8.6a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// RUN: %clang -target arm -march=armebv8.6-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V86A %s
+// CHECK-BE-V86A: "-cc1"{{.*}} "-triple" "armebv8.6{{.*}}" "-target-cpu" "generic"
+
 // Once we have CPUs with optional v8.2-A FP16, we will need a way to turn it
 // on and off. Cortex-A53 is a placeholder for now.
 // RUN: %clang -target armv8a-linux-eabi -mcpu=cortex-a53+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-CORTEX-A53-FP16 %s
@ -432,6 +449,9 @@
 // RUN: %clang -target armv8a-linux-eabi -march=armv8.5-a+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-V85A-FP16 %s
 // CHECK-V85A-FP16: "-cc1"{{.*}} "-triple" "armv8.5{{.*}}" "-target-cpu" "generic" {{.*}}"-target-feature" "+fullfp16"

+// RUN: %clang -target armv8a-linux-eabi -march=armv8.6-a+bf16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-V86A-BF16 %s
+// CHECK-V86A-BF16: "-cc1"{{.*}} "-triple" "armv8.6{{.*}}" "-target-cpu" "generic" {{.*}}"-target-feature" "+bf16"
+
 // RUN: %clang -target arm -march=armv8.2-a+fp16 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-FULLFP16-SOFT %s
 // RUN: %clang -target arm -march=armv8.2-a+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-FULLFP16-SOFT %s
 // RUN: %clang -target arm -march=armv8.2-a+fp16+fp16fml -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-FULLFP16-SOFT %s
--- a/clang/test/Preprocessor/arm-target-features.c
+++ b/clang/test/Preprocessor/arm-target-features.c
@ -841,5 +841,10 @@
 // CHECK-V85A: #define __ARM_ARCH_8_5A__ 1
 // CHECK-V85A: #define __ARM_ARCH_PROFILE 'A'

+// RUN: %clang -target armv8.6a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V86A %s
+// CHECK-V86A: #define __ARM_ARCH 8
+// CHECK-V86A: #define __ARM_ARCH_8_6A__ 1
+// CHECK-V86A: #define __ARM_ARCH_PROFILE 'A'
+
 // RUN: %clang -target arm-none-none-eabi -march=armv7-m -mfpu=softvfp -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SOFTVFP %s
 // CHECK-SOFTVFP-NOT: #define __ARM_FP 0x
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@ -101,6 +101,7 @@ public:
  enum SubArchType {
    NoSubArch,

+    ARMSubArch_v8_6a,
    ARMSubArch_v8_5a,
    ARMSubArch_v8_4a,
    ARMSubArch_v8_3a,
--- a/llvm/include/llvm/Support/AArch64TargetParser.def
+++ b/llvm/include/llvm/Support/AArch64TargetParser.def
@ -44,6 +44,13 @@ AARCH64_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a",
             (AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
              AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
              AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
+AARCH64_ARCH("armv8.6-a", ARMV8_6A, "8.6-A", "v8.6a",
+             ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
+             (AArch64::AEK_CRC  | AArch64::AEK_FP   |
+              AArch64::AEK_SIMD | AArch64::AEK_RAS  | AArch64::AEK_LSE     |
+              AArch64::AEK_RDM  | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD |
+              AArch64::AEK_SM4  | AArch64::AEK_SHA3 | AArch64::AEK_BF16    |
+              AArch64::AEK_SHA2 | AArch64::AEK_AES  | AArch64::AEK_I8MM))
 #undef AARCH64_ARCH

 #ifndef AARCH64_ARCH_EXT_NAME
@ -79,6 +86,8 @@ AARCH64_ARCH_EXT_NAME("memtag",       AArch64::AEK_MTE,         "+mte",   "-mte"
 AARCH64_ARCH_EXT_NAME("ssbs",         AArch64::AEK_SSBS,        "+ssbs",  "-ssbs")
 AARCH64_ARCH_EXT_NAME("sb",           AArch64::AEK_SB,          "+sb",    "-sb")
 AARCH64_ARCH_EXT_NAME("predres",      AArch64::AEK_PREDRES,     "+predres", "-predres")
+AARCH64_ARCH_EXT_NAME("bf16",         AArch64::AEK_BF16,        "+bf16",  "-bf16")
+AARCH64_ARCH_EXT_NAME("i8mm",         AArch64::AEK_I8MM,        "+i8mm",  "-i8mm")
 AARCH64_ARCH_EXT_NAME("tme",          AArch64::AEK_TME,         "+tme",   "-tme")
 #undef AARCH64_ARCH_EXT_NAME

--- a/llvm/include/llvm/Support/AArch64TargetParser.h
+++ b/llvm/include/llvm/Support/AArch64TargetParser.h
@ -55,6 +55,8 @@ enum ArchExtKind : unsigned {
  AEK_SVE2SHA3 =    1 << 26,
  AEK_SVE2BITPERM = 1 << 27,
  AEK_TME =         1 << 28,
+  AEK_BF16 =        1 << 29,
+  AEK_I8MM =        1 << 30,
 };

 enum class ArchKind {
--- a/llvm/include/llvm/Support/ARMTargetParser.def
+++ b/llvm/include/llvm/Support/ARMTargetParser.def
@ -112,6 +112,11 @@ ARM_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a",
         (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
          ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
          ARM::AEK_DOTPROD))
+ARM_ARCH("armv8.6-a", ARMV8_6A, "8.6-A", "v8.6a",
+         ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
+         (ARM::AEK_SEC        | ARM::AEK_MP   | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+          ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP  | ARM::AEK_CRC  | ARM::AEK_RAS |
+          ARM::AEK_DOTPROD    | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES))
 ARM_ARCH("armv8-r", ARMV8R, "8-R", "v8r", ARMBuildAttrs::CPUArch::v8_R,
          FK_NEON_FP_ARMV8,
          (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB |
@ -164,6 +169,7 @@ ARM_ARCH_EXT_NAME("iwmmxt2",  ARM::AEK_IWMMXT2,  nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("maverick", ARM::AEK_MAVERICK, nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("xscale",   ARM::AEK_XSCALE,   nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("fp16fml",  ARM::AEK_FP16FML,  "+fp16fml", "-fp16fml")
+ARM_ARCH_EXT_NAME("bf16",     ARM::AEK_BF16,     "+bf16",    "-bf16")
 ARM_ARCH_EXT_NAME("sb",       ARM::AEK_SB,       "+sb",      "-sb")
 ARM_ARCH_EXT_NAME("lob",      ARM::AEK_LOB,      "+lob",   "-lob")
 ARM_ARCH_EXT_NAME("cdecp0",   ARM::AEK_CDECP0,   "+cdecp0",  "-cdecp0")
--- a/llvm/include/llvm/Support/ARMTargetParser.h
+++ b/llvm/include/llvm/Support/ARMTargetParser.h
@ -46,14 +46,15 @@ enum ArchExtKind : uint64_t {
  AEK_SB      =     1 << 17,
  AEK_FP_DP   =     1 << 18,
  AEK_LOB     =     1 << 19,
-  AEK_CDECP0 =      1 << 20,
-  AEK_CDECP1 =      1 << 21,
-  AEK_CDECP2 =      1 << 22,
-  AEK_CDECP3 =      1 << 23,
-  AEK_CDECP4 =      1 << 24,
-  AEK_CDECP5 =      1 << 25,
-  AEK_CDECP6 =      1 << 26,
-  AEK_CDECP7 =      1 << 27,
+  AEK_BF16    =     1 << 20,
+  AEK_CDECP0 =      1 << 21,
+  AEK_CDECP1 =      1 << 22,
+  AEK_CDECP2 =      1 << 23,
+  AEK_CDECP3 =      1 << 24,
+  AEK_CDECP4 =      1 << 25,
+  AEK_CDECP5 =      1 << 26,
+  AEK_CDECP6 =      1 << 27,
+  AEK_CDECP7 =      1 << 28,

  // Unsupported extensions.
  AEK_OS       =    1ULL << 59,
--- a/llvm/lib/Support/AArch64TargetParser.cpp
+++ b/llvm/lib/Support/AArch64TargetParser.cpp
@ -116,6 +116,8 @@ bool AArch64::getArchFeatures(AArch64::ArchKind AK,
    Features.push_back("+v8.4a");
  if (AK == ArchKind::ARMV8_5A)
    Features.push_back("+v8.5a");
+  if (AK == AArch64::ArchKind::ARMV8_6A)
+    Features.push_back("+v8.6a");

  return AK != ArchKind::INVALID;
 }
--- a/llvm/lib/Support/ARMTargetParser.cpp
+++ b/llvm/lib/Support/ARMTargetParser.cpp
@ -74,6 +74,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
  case ArchKind::ARMV8_3A:
  case ArchKind::ARMV8_4A:
  case ArchKind::ARMV8_5A:
+  case ArchKind::ARMV8_6A:
  case ArchKind::ARMV8R:
  case ArchKind::ARMV8MBaseline:
  case ArchKind::ARMV8MMainline:
@ -108,6 +109,7 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
  case ArchKind::ARMV8_3A:
  case ArchKind::ARMV8_4A:
  case ArchKind::ARMV8_5A:
+  case ArchKind::ARMV8_6A:
    return ProfileKind::A;
  case ArchKind::ARMV2:
  case ArchKind::ARMV2A:
@ -150,6 +152,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
      .Case("v8.3a", "v8.3-a")
      .Case("v8.4a", "v8.4-a")
      .Case("v8.5a", "v8.5-a")
+      .Case("v8.6a", "v8.6-a")
      .Case("v8r", "v8-r")
      .Case("v8m.base", "v8-m.base")
      .Case("v8m.main", "v8-m.main")
--- a/llvm/lib/Support/Triple.cpp
+++ b/llvm/lib/Support/Triple.cpp
@ -627,6 +627,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
    return Triple::ARMSubArch_v8_4a;
  case ARM::ArchKind::ARMV8_5A:
    return Triple::ARMSubArch_v8_5a;
+  case ARM::ArchKind::ARMV8_6A:
+    return Triple::ARMSubArch_v8_6a;
  case ARM::ArchKind::ARMV8R:
    return Triple::ARMSubArch_v8r;
  case ARM::ArchKind::ARMV8MBaseline:
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@ -365,6 +365,9 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
    "true", "Use an instruction sequence for taking the address of a global "
    "that allows a memory tag in the upper address bits">;

+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
+    "true", "Enable BFloat16 Extension" >;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@ -391,8 +394,11 @@ def HasV8_5aOps : SubtargetFeature<
  "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
  [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict,
   FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist,
-   FeatureBranchTargetId]
->;
+   FeatureBranchTargetId]>;
+
+def HasV8_6aOps : SubtargetFeature<
+  "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
+  [HasV8_5aOps, FeatureBF16]>;

 //===----------------------------------------------------------------------===//
 // Register File Description
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@ -7786,6 +7786,110 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
  let Inst{4-0}   = Rd;
 }

+
+//----------------------------------------------------------------------------
+// Armv8.6 BFloat16 Extension (assembly-only: every pattern list below is [])
+//----------------------------------------------------------------------------
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {

+class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
+                                   string kind2, RegisterOperand RegType,
+                                   ValueType AccumType, ValueType InputType> // AccumType/InputType are not referenced in this class yet
+  : BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, []> {
+  let AsmString = !strconcat(asm,
+                             "{\t$Rd" # kind1 # ", $Rn" # kind2 #
+                               ", $Rm" # kind2 # "}");
+}
+
+multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
+  def v4f16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
+                                           v2f32, v8i8>;
+  def v8f16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
+                                           v4f32, v16i8>;
+}
+
+class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
+                                      string dst_kind, string lhs_kind,
+                                      string rhs_kind,
+                                      RegisterOperand RegType,
+                                      ValueType AccumType,
+                                      ValueType InputType> // AccumType/InputType are not referenced in this class yet
+  : BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
+                        RegType, RegType, V128, VectorIndexS,
+                        asm, "", dst_kind, lhs_kind, rhs_kind,
+        []> {

+  bits<2> idx;
+  let Inst{21}    = idx{0};  // L: index bit 0
+  let Inst{11}    = idx{1};  // H: index bit 1
+}
+
+multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {

+  def v4f16  : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
+                                               ".2h", V64, v2f32, v8i8>;
+  def v8f16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
+                                              ".2h", V128, v4f32, v16i8>;
+}
+
+class SIMDBF16MLAL<bit Q, string asm>
+  : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
+              []> { // TODO: Add intrinsics (pattern list is empty for now)
+  let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
+}
+
+class SIMDBF16MLALIndex<bit Q, string asm>
+  : I<(outs V128:$dst),
+      (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
+      "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
+          []>, // TODO: Add intrinsics (pattern list is empty for now)
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<4> Rm;
+  bits<3> idx;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-22} = 0b00111111;
+  let Inst{21-20} = idx{1-0};
+  let Inst{19-16} = Rm;
+  let Inst{15-12} = 0b1111;
+  let Inst{11}    = idx{2};   // H: index bit 2
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SIMDThreeSameVectorBF16MatrixMul<string asm>
+  : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
+                                V128, asm, ".4s",
+                          []> {
+  let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
+                                    ", $Rm", ".8h", "}");
+}
+
+class SIMD_BFCVTN
+  : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+                           "bfcvtn", ".4h", ".4s",
+    []>;
+
+class SIMD_BFCVTN2
+  : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
+                           "bfcvtn2", ".8h", ".4s",
+    []>;
+
+class BF16ToSinglePrecision<string asm> // NOTE(review): operands are FPR32 in, FPR16 out; the class name suggests the reverse - confirm
+  : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", []>,
+    Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-10} = 0b0001111001100011010000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
+
 // ARMv8.2-A Dot Product Instructions (Indexed)
 class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
                                      string lhs_kind, string rhs_kind,
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@ -23,6 +23,8 @@ def HasV8_4a         : Predicate<"Subtarget->hasV8_4aOps()">,
                                 AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
 def HasV8_5a         : Predicate<"Subtarget->hasV8_5aOps()">,
                                 AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+def HasV8_6a         : Predicate<"Subtarget->hasV8_6aOps()">,
+                                 AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
 def HasVH            : Predicate<"Subtarget->hasVH()">,
                       AssemblerPredicate<(all_of FeatureVH), "vh">;

@ -142,6 +144,8 @@ def HasETE           : Predicate<"Subtarget->hasETE()">,
                       AssemblerPredicate<(all_of FeatureETE), "ete">;
 def HasTRBE          : Predicate<"Subtarget->hasTRBE()">,
                       AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
+def HasBF16          : Predicate<"Subtarget->hasBF16()">,
+                       AssemblerPredicate<(all_of FeatureBF16), "bf16">;
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
@ -746,6 +750,20 @@ defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
 defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
 }

+// ARMv8.6-A BFloat16 (assembly only; no intrinsics are attached to these defs)
+let Predicates = [HasBF16] in { // NOTE(review): HasNEON is not listed, unlike the FP16FML block that follows - confirm
+defm BFDOT       : SIMDThreeSameVectorBFDot<1, "bfdot">;
+defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
+def BFMMLA       : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
+def BFMLALB      : SIMDBF16MLAL<0, "bfmlalb">;
+def BFMLALT      : SIMDBF16MLAL<1, "bfmlalt">;
+def BFMLALBIdx   : SIMDBF16MLALIndex<0, "bfmlalb">;
+def BFMLALTIdx   : SIMDBF16MLALIndex<1, "bfmlalt">;
+def BFCVTN       : SIMD_BFCVTN;
+def BFCVTN2      : SIMD_BFCVTN2;
+def BFCVT        : BF16ToSinglePrecision<"bfcvt">;
+}
+
 // ARMv8.2-A FP16 Fused Multiply-Add Long
 let Predicates = [HasNEON, HasFP16FML] in {
 defm FMLAL      : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@ -1197,6 +1197,18 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
  defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
  defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  int_aarch64_sve_fsqrt>;

+  let Predicates = [HasBF16, HasSVE] in { // SVE BFloat16 instructions need both features
+    def BFDOT_ZZZ    : sve_bfloat_dot<"bfdot">;
+    def BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot">;
+    def BFMMLA_ZZZ  : sve_bfloat_matmul<"bfmmla">;
+    def BFMMLA_B_ZZZ  : sve_bfloat_matmul_longvecl<0b0, "bfmlalb">; // NOTE(review): the four BFMMLA_B/T_* records carry the bfmlalb/bfmlalt mnemonics - confirm the naming is intended
+    def BFMMLA_T_ZZZ  : sve_bfloat_matmul_longvecl<0b1, "bfmlalt">;
+    def BFMMLA_B_ZZI  : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb">;
+    def BFMMLA_T_ZZI  : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt">;
+    def BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt">;
+    def BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt">;
+  }
+
  // InstAliases
  def : InstAlias<"mov $Zd, $Zn",
                  (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@ -76,6 +76,7 @@ protected:
  bool HasV8_3aOps = false;
  bool HasV8_4aOps = false;
  bool HasV8_5aOps = false;
+  bool HasV8_6aOps = false;

  bool HasFPARMv8 = false;
  bool HasNEON = false;
@ -144,6 +145,9 @@ protected:
  bool HasMTE = false;
  bool HasTME = false;

+  // Armv8.6-A Extensions
+  bool HasBF16 = false;
+
  // Arm SVE2 extensions
  bool HasSVE2AES = false;
  bool HasSVE2SM4 = false;
@ -403,6 +407,9 @@ public:
  bool hasSVE2SHA3() const { return HasSVE2SHA3; }
  bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }

+  // Armv8.6-A Extensions
+  bool hasBF16() const { return HasBF16; }
+
  bool isLittleEndian() const { return IsLittle; }

  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@ -2859,6 +2859,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
    Str += "ARMv8.4a";
  else if (FBS[AArch64::HasV8_5aOps])
    Str += "ARMv8.5a";
+  else if (FBS[AArch64::HasV8_6aOps])
+    Str += "ARMv8.6a";
  else {
    auto ext = std::find_if(std::begin(ExtensionMap),
      std::end(ExtensionMap),
@ -5094,6 +5096,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
      break;
    case AArch64::ArchKind::ARMV8_4A:
    case AArch64::ArchKind::ARMV8_5A:
+    case AArch64::ArchKind::ARMV8_6A:
      RequestedExtensions.push_back("sm4");
      RequestedExtensions.push_back("sha3");
      RequestedExtensions.push_back("sha2");
@ -5113,6 +5116,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
      break;
    case AArch64::ArchKind::ARMV8_4A:
    case AArch64::ArchKind::ARMV8_5A:
+    case AArch64::ArchKind::ARMV8_6A:
      RequestedExtensions.push_back("nosm4");
      RequestedExtensions.push_back("nosha3");
      RequestedExtensions.push_back("nosha2");
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@ -7394,6 +7394,96 @@ multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
  def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
 }

+//===----------------------------------------------------------------------===//
+// SVE BFloat16 Group (assembly-only: empty pattern lists and Sched<[]>)
+//===----------------------------------------------------------------------===//
+
+class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
+: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  let Inst{31-21} = 0b01100100011;
+  let Inst{15-14} = opc;
+  let Inst{13-10} = 0b0000;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda"; // result register is tied to the $_Zda input
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ElementSizeH;
+}
+
+class sve_bfloat_dot<string asm>
+: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
+  (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
+  bits<5> Zm;
+  let Inst{20-16} = Zm;
+}
+
+class sve_bfloat_dot_indexed<string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+  (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
+  bits<2> iop;
+  bits<3> Zm;
+  let Inst{20-19} = iop;
+  let Inst{18-16} = Zm;
+}
+
+class sve_bfloat_matmul<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
+  asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zm;
+  bits<5> Zda;
+  bits<5> Zn;
+  let Inst{31-21} = 0b01100100011;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b111001;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda"; // result register is tied to the $_Zda input
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ElementSizeH;
+}
+
+class sve_bfloat_matmul_longvecl<bit BT, string asm>
+: sve_bfloat_matmul<asm> {
+  let Inst{23}    = 0b1; // these lets re-assign bits that sve_bfloat_matmul already sets
+  let Inst{14-13} = 0b00;
+  let Inst{10}    = BT;
+}
+
+class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+  (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
+  bits<3> iop;
+  bits<3> Zm;
+  let Inst{23}    = 0b1;
+  let Inst{20-19} = iop{2-1}; // upper two index bits
+  let Inst{18-16} = Zm;
+  let Inst{11}    = iop{0}; // lowest index bit
+  let Inst{10}    = BT;
+}
+
+class sve_bfloat_convert<bit N, string asm>
+: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
+  asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<3> Pg;
+  bits<5> Zn;
+  let Inst{31-25} = 0b0110010;
+  let Inst{24}    = N;
+  let Inst{23-13} = 0b10001010101;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd"; // result register is tied to the $_Zd input
+  let DestructiveInstType = DestructiveOther;
+  let hasSideEffects = 1; // NOTE(review): the other BF16 classes in this group do not set this - confirm it is required
+  let ElementSize = ElementSizeS;
+}
+
 /// Addressing modes
 def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
 def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>;
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@ -424,6 +424,10 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
 def FeatureSB       : SubtargetFeature<"sb", "HasSB", "true",
  "Enable v8.5a Speculation Barrier" >;

+// Armv8.6-A extensions
+// "bf16" sets the subtarget's HasBF16 flag and implies FeatureNEON, so
+// enabling BFloat16 also enables NEON.
+def FeatureBF16     : SubtargetFeature<"bf16", "HasBF16", "true",
+  "Enable support for BFloat16 instructions",  [FeatureNEON]>;
+
 // Armv8.1-M extensions

 def FeatureLOB            : SubtargetFeature<"lob", "HasLOB", "true",
@ -523,6 +527,10 @@ def HasV8_5aOps   : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
                                   "Support ARM v8.5a instructions",
                                   [HasV8_4aOps, FeatureSB]>;

+// "v8.6a" sets the subtarget's HasV8_6aOps flag and implies everything in
+// v8.5a plus the BFloat16 feature.
+def HasV8_6aOps   : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true",
+                                   "Support ARM v8.6a instructions",
+                                   [HasV8_5aOps, FeatureBF16]>;
+
 def HasV8_1MMainlineOps : SubtargetFeature<
               "v8.1m.main", "HasV8_1MMainlineOps", "true",
               "Support ARM v8-1M Mainline instructions",
@ -797,6 +805,19 @@ def ARMv85a   : Architecture<"armv8.5-a", "ARMv85a",  [HasV8_5aOps,
                                                       FeatureCRC,
                                                       FeatureRAS,
                                                       FeatureDotProd]>;
+// Architecture record for "armv8.6-a": the v8.6a instruction-set feature
+// (which itself pulls in v8.5a and BFloat16) plus the features listed below.
+def ARMv86a   : Architecture<"armv8.6-a", "ARMv86a",  [HasV8_6aOps,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCrypto,
+                                                       FeatureCRC,
+                                                       FeatureRAS,
+                                                       FeatureDotProd]>;

 def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,
                                                       FeatureRClass,
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@ -8926,3 +8926,93 @@ def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
                     (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
 def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
                     (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+
+// ARMv8.6a BFloat16 instructions.
+let Predicates = [HasBF16, HasNEON] in {
+// Common base for the BFloat16 dot-product encodings. It forwards to N3Vnp
+// with the third argument fixed to 0b1101 and the fifth to 0, and leaves the
+// asm strings and pattern empty for the derived records to fill in.
+//   op27_23, op21_20, op6 - opcode fields forwarded to N3Vnp (op6 is the
+//                           Q bit at every use in this block).
+//   oops, iops            - output and input operand dags.
+class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6,
+               dag oops, dag iops>
+   : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops,
+           N3RegFrm, IIC_VDOTPROD, "", "", []> {
+    let hasNoSchedulingInfo = 1;
+    let DecoderNamespace = "VFPV8";
+}
+
+// Vector-by-vector BFloat16 dot product: all three operands use RegTy and
+// the accumulator input $Vd is tied to the result $dst.
+//   Q       - forwarded as the op6 argument of BF16VDOT.
+//   RegTy   - register class used for every operand.
+//   opc     - mnemonic stem; ".bf16" is appended to form the asm string.
+//   AccumTy, InputTy - not referenced in this class body; no selection
+//                      pattern is defined here.
+class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy>
+   : BF16VDOT<0b11000, 0b00,  Q, (outs RegTy:$dst),
+              (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm)> {
+  let Constraints = "$dst = $Vd";
+  let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+  // Same value the BF16VDOT base class already assigns.
+    let DecoderNamespace = "VFPV8";
+}
+
+// Vector-by-lane BFloat16 dot product. $Vm is restricted to DPR_VFP2 and is
+// followed by a one-bit lane selector; the accumulator input $Vd is tied to
+// the result $dst. The multiclass defines a single unnamed record, so the
+// defm name is the instruction name.
+//   Q     - forwarded as the op6 argument of BF16VDOT.
+//   RegTy - register class of $dst, $Vd and $Vn.
+//   opc   - mnemonic stem; ".bf16" is appended to form the asm string.
+//   AccumTy, InputTy, RHS - not referenced in this body; no selection
+//                           pattern is defined here.
+multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
+                     ValueType InputTy, dag RHS> {
+
+  def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst),
+                    (ins RegTy:$Vd, RegTy:$Vn,
+                    DPR_VFP2:$Vm, VectorIndex32:$lane)> {
+    // Single-bit lane index, encoded in Inst{5}.
+    bit lane;
+    let Inst{5} = lane;
+    let Constraints = "$dst = $Vd";
+    let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm$lane");
+    let DecoderNamespace = "VFPV8";
+  }
+
+}
+
+// vdot.bf16, vector-by-vector: D-register (Q = 0) and Q-register (Q = 1) forms.
+def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
+def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
+
+// vdot.bf16, vector-by-lane: D-register (Q = 0) and Q-register (Q = 1) forms.
+defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
+defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+
+// Three-register BFloat16 encoding built directly on N3Vnp, with the third
+// argument fixed to 0b1100. All operands use RegTy and the accumulator input
+// $Vd is tied to the result $dst. No selection pattern is defined.
+//   Q     - forwarded as the fourth N3Vnp argument.
+//   RegTy - register class used for every operand.
+//   opc   - mnemonic stem; ".bf16" is appended to form the asm string.
+class BF16MM<bit Q, RegisterClass RegTy,
+             string opc>
+   : N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
+           (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+           N3RegFrm, IIC_VDOTPROD, "", "", []> {
+   let Constraints = "$dst = $Vd";
+   let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+   let DecoderNamespace = "VFPV8";
+   let hasNoSchedulingInfo = 1;
+}
+
+// vmmla.bf16 is only instantiated for Q registers.
+def VMMLA : BF16MM<1, QPR, "vmmla">;
+
+// Q-register vector-by-vector "vfma<suffix>.bf16" encoding on N3VCP8. The
+// accumulator input $Vd is tied to the result $dst; no selection pattern is
+// defined yet (see the TODO below).
+//   T      - forwarded as the third N3VCP8 argument; the two records below
+//            use 1 for the "t" mnemonic and 0 for the "b" mnemonic.
+//   suffix - appended to "vfma" to form the mnemonic.
+class VBF16MALQ<bit T, string suffix>
+  : N3VCP8<0b00, 0b11, T, 1,
+           (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
+           NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
+              []> { // TODO: Add intrinsics
+  let Constraints = "$dst = $Vd";
+  let DecoderNamespace = "VFPV8";
+  let hasNoSchedulingInfo = 1;
+}
+
+// vfmat.bf16 / vfmab.bf16, vector-by-vector.
+def VBF16MALTQ: VBF16MALQ<1, "t">;
+def VBF16MALBQ: VBF16MALQ<0, "b">;
+
+// Q-register vector-by-lane "vfma<suffix>.bf16" encoding on N3VLaneCP8.
+// $Vm is restricted to DPR_8 and is followed by a 2-bit lane index; the
+// accumulator input $Vd is tied to the result $dst. The multiclass defines a
+// single unnamed record, so the defm name is the instruction name.
+//   T      - forwarded as the third N3VLaneCP8 argument; the two records
+//            below use 1 for the "t" mnemonic and 0 for the "b" mnemonic.
+//   suffix - appended to "vfma" to form the mnemonic.
+multiclass VBF16MALQI<bit T, string suffix> {
+  def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
+              (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
+               IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
+  // The lane index is split across two non-adjacent bits: high bit in
+  // Inst{5}, low bit in Inst{3}.
+  bits<2> idx;
+  let Inst{5} = idx{1};
+  let Inst{3} = idx{0};
+  let Constraints = "$dst = $Vd";
+  let DecoderNamespace = "VFPV8";
+  let hasNoSchedulingInfo = 1;
+  }
+
+}
+
+// vfmat.bf16 / vfmab.bf16, vector-by-lane.
+defm VBF16MALTQI: VBF16MALQI<1, "t">;
+defm VBF16MALBQI: VBF16MALQI<0, "b">;
+
+// vcvt.bf16.f32 Dd, Qm: takes a Q-register source and produces a D-register
+// result. Defined on N2V with no selection pattern and no scheduling info.
+let hasNoSchedulingInfo = 1 in {
+def BF16_VCVT :  N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
+                    (outs DPR:$Vd), (ins QPR:$Vm),
+                    NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>;
+}
+}
+// End of BFloat16 instructions
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@ -1867,6 +1867,35 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,

 } // End of 'let Constraints = "$a = $dst" in'

+// BFloat16  - Single precision, unary, predicated
+// Encoding class for the scalar "<opc>.bf16.f32 Sd, Sm" conversions. The
+// extra $dst input is tied to the result $Sd. NOTE(review): presumably the
+// tie exists because only one half of Sd is written - confirm against the
+// architecture manual. Assembly requires the BF16 feature.
+//   opc    - mnemonic stem.
+//   op7_6  - value placed in Inst{7-6}; it is the only field that differs
+//            between the two records defined below.
+class BF16_VCVT<string opc, bits<2> op7_6>
+   : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm),
+           VFPUnaryFrm, NoItinerary,
+           opc, ".bf16.f32\t$Sd, $Sm", []>,
+      RegConstraint<"$dst = $Sd">,
+      Requires<[HasBF16]>,
+     Sched<[]> {
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  // Each 5-bit S-register number is split: the upper four bits go into the
+  // 4-bit register field and the lowest bit into a separate single bit.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = 0b11101; // opcode1
+  let Inst{21-20} = 0b11;    // opcode2
+  let Inst{19-16} = 0b0011;  // opcode3
+  let Inst{11-8}  = 0b1001;
+  let Inst{7-6}   = op7_6;
+  let Inst{4}     = 0;
+  let DecoderNamespace = "VFPV8";
+}
+
+// vcvtb.bf16.f32 uses op7_6 = 0b01, vcvtt.bf16.f32 uses op7_6 = 0b11.
+def BF16_VCVTB : BF16_VCVT<"vcvtb", 0b01>;
+def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>;
+
 //===----------------------------------------------------------------------===//
 // FP Multiply-Accumulate Operations.
 //
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@ -72,6 +72,8 @@ def HasV8_4a         : Predicate<"Subtarget->hasV8_4aOps()">,
                                 AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
 def HasV8_5a         : Predicate<"Subtarget->hasV8_5aOps()">,
                                 AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+// True when the subtarget reports v8.6a ops; the assembler-side check
+// requires HasV8_6aOps and carries the name "armv8.6a".
+def HasV8_6a         : Predicate<"Subtarget->hasV8_6aOps()">,
+                                 AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2Base()">;
 def HasVFP2          : Predicate<"Subtarget->hasVFP2Base()">,
                                 AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">;
@ -106,6 +108,8 @@ def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                                 AssemblerPredicate<(all_of FeatureFullFP16),"full half-float">;
 def HasFP16FML       : Predicate<"Subtarget->hasFP16FML()">,
                                 AssemblerPredicate<(all_of FeatureFP16FML),"full half-float fml">;
+// True when the subtarget reports BFloat16 support; the assembler-side check
+// requires FeatureBF16. The name string is the text shown after
+// "instruction requires:" in assembler diagnostics.
+def HasBF16          : Predicate<"Subtarget->hasBF16()">,
+                                 AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">;
 def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
                                 AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">;
 def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@ -108,6 +108,7 @@ protected:
    ARMv83a,
    ARMv84a,
    ARMv85a,
+    ARMv86a,
    ARMv8a,
    ARMv8mBaseline,
    ARMv8mMainline,
@ -157,6 +158,7 @@ protected:
  bool HasV8_3aOps = false;
  bool HasV8_4aOps = false;
  bool HasV8_5aOps = false;
+  bool HasV8_6aOps = false;
  bool HasV8MBaselineOps = false;
  bool HasV8MMainlineOps = false;
  bool HasV8_1MMainlineOps = false;
@ -255,6 +257,9 @@ protected:
  /// HasFP16FML - True if subtarget supports half-precision FP fml operations
  bool HasFP16FML = false;

+  /// HasBF16 - True if subtarget supports BFloat16 floating point operations
+  bool HasBF16 = false;
+
  /// HasD32 - True if subtarget has the full 32 double precision
  /// FP registers for VFPv3.
  bool HasD32 = false;
@ -581,6 +586,7 @@ public:
  bool hasV8_3aOps() const { return HasV8_3aOps; }
  bool hasV8_4aOps() const { return HasV8_4aOps; }
  bool hasV8_5aOps() const { return HasV8_5aOps; }
+  bool hasV8_6aOps() const { return HasV8_6aOps; }
  bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
  bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
  bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@ -6322,6 +6322,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
      Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
      Mnemonic == "bxns"  || Mnemonic == "blxns" ||
+      Mnemonic == "vdot"  || Mnemonic == "vmmla"  ||
      Mnemonic == "vudot" || Mnemonic == "vsdot" ||
      Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
      Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
@ -6462,6 +6463,8 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
      Mnemonic == "vudot" || Mnemonic == "vsdot" ||
      Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
      Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
+      Mnemonic == "vfmat" || Mnemonic == "vfmab" ||
+      Mnemonic == "vdot"  || Mnemonic == "vmmla" ||
      Mnemonic == "sb"    || Mnemonic == "ssbb"  ||
      Mnemonic == "pssbb" ||
      Mnemonic == "bfcsel" || Mnemonic == "wls" ||
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@ -856,6 +856,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
  case ARM::ArchKind::ARMV8_3A:
  case ARM::ArchKind::ARMV8_4A:
  case ARM::ArchKind::ARMV8_5A:
+  case ARM::ArchKind::ARMV8_6A:
    setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
    setAttributeItem(ARM_ISA_use, Allowed, false);
    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
--- a/llvm/test/MC/AArch64/SVE/bfcvt-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfcvt-diagnostics.s
@ -0,0 +1,27 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+sve,+bf16 2>&1 < %s| FileCheck %s
+
+bfcvt z0.s, p0/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfcvt z0.s, p0/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvt z0.h, p0/m, z1.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfcvt z0.h, p0/m, z1.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvt z0.h, p0/z, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: bfcvt z0.h, p0/z, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvt z0.h, p8/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: bfcvt z0.h, p8/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.h, p0/m, z7.h
+bfcvt z0.h, p0/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx with a different element size
+// CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
--- a/llvm/test/MC/AArch64/SVE/bfcvt.s
+++ b/llvm/test/MC/AArch64/SVE/bfcvt.s
@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+bf16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+bfcvt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x65]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0.S, p0/m, z2.S
+// CHECK-INST: movprfx z0.s, p0/m, z2.s
+// CHECK-ENCODING: [0x40,0x20,0x91,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfcvt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x65]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z2
+// CHECK-INST: movprfx z0, z2
+// CHECK-ENCODING: [0x40,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfcvt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x65]
+// CHECK-ERROR: instruction requires: bf16 sve
--- a/llvm/test/MC/AArch64/SVE/bfcvtnt-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfcvtnt-diagnostics.s
@ -0,0 +1,27 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+sve,+bf16  2>&1 < %s| FileCheck %s
+
+bfcvtnt z0.s, p0/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfcvtnt z0.s, p0/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvtnt z0.h, p0/m, z1.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvtnt z0.h, p0/z, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: bfcvtnt z0.h, p0/z, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfcvtnt z0.h, p8/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: bfcvtnt z0.h, p8/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.h, p0/m, z7.h
+bfcvtnt z0.h, p0/m, z1.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx with a different element size
+// CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
--- a/llvm/test/MC/AArch64/SVE/bfcvtnt.s
+++ b/llvm/test/MC/AArch64/SVE/bfcvtnt.s
@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+bf16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+bfcvtnt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvtnt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0.S, p0/m, z2.S
+// CHECK-INST: movprfx z0.s, p0/m, z2.s
+// CHECK-ENCODING: [0x40,0x20,0x91,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfcvtnt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvtnt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z2
+// CHECK-INST: movprfx z0, z2
+// CHECK-ENCODING: [0x40,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfcvtnt z0.H, p0/m, z1.S
+// CHECK-INST: bfcvtnt z0.h, p0/m, z1.s
+// CHECK-ENCODING: [0x20,0xa0,0x8a,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
--- a/llvm/test/MC/AArch64/SVE/bfdot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfdot-diagnostics.s
@ -0,0 +1,53 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+sve,+bf16  2>&1 < %s| FileCheck %s
+
+bfdot z0.s, z1.s, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfdot z0.s, z1.s, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.h, z1.h, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfdot z0.h, z1.h, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.s, z1.h, z2.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.h..z7.h
+// CHECK-NEXT: bfdot z0.s, z1.h, z2.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.s, p0/m, z7.s
+bfdot z0.s, z1.h, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: bfdot z0.s, z1.h, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.s, z1.s, z2.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfdot z0.s, z1.s, z2.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.h, z1.h, z2.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfdot z0.h, z1.h, z2.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.s, z1.h, z2.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.h..z7.h
+// CHECK-NEXT: bfdot z0.s, z1.h, z2.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.s, z1.h, z8.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: bfdot z0.s, z1.h, z8.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfdot z0.s, z1.h, z2.h[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: bfdot z0.s, z1.h, z2.h[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.s, p0/m, z7.s
+bfdot z0.s, z1.h, z2.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: bfdot z0.s, z1.h, z2.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
--- a/llvm/test/MC/AArch64/SVE/bfdot.s
+++ b/llvm/test/MC/AArch64/SVE/bfdot.s
@ -0,0 +1,52 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+bf16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+bfdot z0.S, z1.H, z2.H
+// CHECK-INST: bfdot z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x80,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfdot z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfdot z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x40,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfdot z0.S, z1.H, z2.H[3]
+// CHECK-INST: bfdot z0.s, z1.h, z2.h[3]
+// CHECK-ENCODING: [0x20,0x40,0x7a,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+// --------------------------------------------------------------------------//
+// Test compatibility with MOVPRFX instruction.
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfdot z0.S, z1.H, z2.H
+// CHECK-INST: bfdot z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x80,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfdot z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfdot z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x40,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfdot z0.S, z1.H, z2.H[3]
+// CHECK-INST: bfdot z0.s, z1.h, z2.h[3]
+// CHECK-ENCODING: [0x20,0x40,0x7a,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
--- a/llvm/test/MC/AArch64/SVE/bfmlal-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfmlal-diagnostics.s
@ -0,0 +1,42 @@
+// RUN: not llvm-mc -o - -triple=aarch64 -mattr=+sve,+bf16  2>&1 %s | FileCheck %s
+
+bfmlalb z0.S, z1.H, z7.H[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: bfmlalb z0.S, z1.H, z7.H[8]
+// CHECK-NEXT: ^
+
+bfmlalb z0.S, z1.H, z8.H[7]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: bfmlalb z0.S, z1.H, z8.H[7]
+// CHECK-NEXT: ^
+
+bfmlalt z0.S, z1.H, z7.H[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: bfmlalt z0.S, z1.H, z7.H[8]
+// CHECK-NEXT: ^
+
+bfmlalt z0.S, z1.H, z8.H[7]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: bfmlalt z0.S, z1.H, z8.H[7]
+// CHECK-NEXT: ^
+
+bfmlalt z0.S, z1.H, z7.2h[2]
+// CHECK: error: invalid vector kind qualifier
+// CHECK-NEXT: bfmlalt z0.S, z1.H, z7.2h[2]
+// CHECK-NEXT:                     ^
+
+bfmlalt z0.S, z1.H, z2.s[2]
+// CHECK: error: Invalid restricted vector register, expected z0.h..z7.h
+// CHECK-NEXT: bfmlalt z0.S, z1.H, z2.s[2]
+// CHECK-NEXT:                     ^
+
+bfmlalt z0.S, z1.s, z2.h[2]
+// CHECK: error: invalid element width
+// CHECK-NEXT: bfmlalt z0.S, z1.s, z2.h[2]
+// CHECK-NEXT:               ^
+
+movprfx z0.s, p0/m, z7.s
+bfmlalt z0.s, z1.h, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx
+// CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h
+// CHECK-NEXT: ^
--- a/llvm/test/MC/AArch64/SVE/bfmlal.s
+++ b/llvm/test/MC/AArch64/SVE/bfmlal.s
@ -0,0 +1,157 @@
+// RUN: llvm-mc -o - -triple=aarch64 -show-encoding -mattr=+sve,+bf16 %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -o - -triple=aarch64 -show-encoding %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+bfmlalb z0.S, z1.H, z2.H
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x80,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalt z0.S, z1.H, z2.H
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x84,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalb z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x40,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalt z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x44,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalb z0.S, z1.H, z2.H[7]
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h[7]
+// CHECK-ENCODING: [0x20,0x48,0xfa,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalt z0.S, z1.H, z2.H[7]
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h[7]
+// CHECK-ENCODING: [0x20,0x4c,0xfa,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalt z0.S, z1.H, z7.H[7]
+// CHECK-INST: bfmlalt z0.s, z1.h, z7.h[7]
+// CHECK-ENCODING: [0x20,0x4c,0xff,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalb z10.S, z21.H, z14.H
+// CHECK-INST: bfmlalb z10.s, z21.h, z14.h
+// CHECK-ENCODING: [0xaa,0x82,0xee,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalt z14.S, z10.H, z21.H
+// CHECK-INST: bfmlalt z14.s, z10.h, z21.h
+// CHECK-ENCODING: [0x4e,0x85,0xf5,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+bfmlalb z21.s, z14.h, z3.h[2]
+// CHECK-INST: bfmlalb z21.s, z14.h, z3.h[2]
+// CHECK-ENCODING: [0xd5,0x41,0xeb,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+// --------------------------------------------------------------------------//
+// Test compatibility with MOVPRFX instruction.
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalb z0.S, z1.H, z2.H
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x80,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalt z0.S, z1.H, z2.H
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0x84,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalb z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x40,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalt z0.S, z1.H, z2.H[0]
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h[0]
+// CHECK-ENCODING: [0x20,0x44,0xe2,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalb z0.S, z1.H, z2.H[7]
+// CHECK-INST: bfmlalb z0.s, z1.h, z2.h[7]
+// CHECK-ENCODING: [0x20,0x48,0xfa,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalt z0.S, z1.H, z2.H[7]
+// CHECK-INST: bfmlalt z0.s, z1.h, z2.h[7]
+// CHECK-ENCODING: [0x20,0x4c,0xfa,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalt z0.S, z1.H, z7.H[7]
+// CHECK-INST: bfmlalt z0.s, z1.h, z7.h[7]
+// CHECK-ENCODING: [0x20,0x4c,0xff,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z10, z7
+// CHECK-INST: movprfx z10, z7
+// CHECK-ENCODING: [0xea,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalb z10.S, z21.H, z14.H
+// CHECK-INST: bfmlalb z10.s, z21.h, z14.h
+// CHECK-ENCODING: [0xaa,0x82,0xee,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z14, z7
+// CHECK-INST: movprfx z14, z7
+// CHECK-ENCODING: [0xee,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalt z14.S, z10.H, z21.H
+// CHECK-INST: bfmlalt z14.s, z10.h, z21.h
+// CHECK-ENCODING: [0x4e,0x85,0xf5,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+movprfx z21, z7
+// CHECK-INST: movprfx z21, z7
+// CHECK-ENCODING: [0xf5,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmlalb z21.s, z14.h, z3.h[2]
+// CHECK-INST: bfmlalb z21.s, z14.h, z3.h[2]
+// CHECK-ENCODING: [0xd5,0x41,0xeb,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
--- a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
@ -0,0 +1,22 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+sve,+bf16  2>&1 < %s| FileCheck %s
+
+bfmmla z0.s, z1.s, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.s, z1.s, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.h, z1.h, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.h, z1.h, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.s, z1.h, z2.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.s, z1.h, z2.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.s, p0/m, z7.s
+bfmmla z0.s, z1.h, z2.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: bfmmla z0.s, z1.h, z2.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
--- a/llvm/test/MC/AArch64/SVE/bfmmla.s
+++ b/llvm/test/MC/AArch64/SVE/bfmmla.s
@ -0,0 +1,22 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+bf16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+bfmmla z0.S, z1.H, z2.H
+// CHECK-INST: bfmmla z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0xe4,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
+
+// --------------------------------------------------------------------------//
+// Test compatibility with MOVPRFX instruction.
+
+movprfx z0, z7
+// CHECK-INST: movprfx z0, z7
+// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04]
+// CHECK-ERROR: instruction requires: sve
+
+bfmmla z0.S, z1.H, z2.H
+// CHECK-INST: bfmmla z0.s, z1.h, z2.h
+// CHECK-ENCODING: [0x20,0xe4,0x62,0x64]
+// CHECK-ERROR: instruction requires: bf16 sve
--- a/llvm/test/MC/AArch64/armv8.6a-bf16.s
+++ b/llvm/test/MC/AArch64/armv8.6a-bf16.s
@ -0,0 +1,115 @@
+// RUN:     llvm-mc -triple aarch64 -show-encoding  -mattr=+bf16 < %s       | FileCheck %s
+// RUN:     llvm-mc -triple aarch64 -show-encoding  -mattr=+v8.6a < %s      | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding  -mattr=-bf16  < %s 2>&1 | FileCheck %s --check-prefix=NOBF16
+// RUN: not llvm-mc -triple aarch64 -show-encoding  < %s 2>&1 | FileCheck %s --check-prefix=NOBF16
+
+
+bfdot v2.2s, v3.4h, v4.4h
+bfdot v2.4s, v3.8h, v4.8h
+// CHECK: bfdot v2.2s, v3.4h, v4.4h      // encoding: [0x62,0xfc,0x44,0x2e]
+// CHECK: bfdot v2.4s, v3.8h, v4.8h      // encoding: [0x62,0xfc,0x44,0x6e]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.2s, v3.4h, v4.4h
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.4s, v3.8h, v4.8h
+
+bfdot  v2.2s, v3.4h, v4.2h[0]
+bfdot  v2.2s, v3.4h, v4.2h[1]
+bfdot  v2.2s, v3.4h, v4.2h[2]
+bfdot  v2.2s, v3.4h, v4.2h[3]
+// CHECK: bfdot   v2.2s, v3.4h, v4.2h[0]  // encoding: [0x62,0xf0,0x44,0x0f]
+// CHECK: bfdot   v2.2s, v3.4h, v4.2h[1]  // encoding: [0x62,0xf0,0x64,0x0f]
+// CHECK: bfdot   v2.2s, v3.4h, v4.2h[2]  // encoding: [0x62,0xf8,0x44,0x0f]
+// CHECK: bfdot   v2.2s, v3.4h, v4.2h[3]  // encoding: [0x62,0xf8,0x64,0x0f]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot   v2.2s, v3.4h, v4.2h[0]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot   v2.2s, v3.4h, v4.2h[1]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot   v2.2s, v3.4h, v4.2h[2]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot   v2.2s, v3.4h, v4.2h[3]
+
+
+bfdot v2.4s, v3.8h, v4.2h[0]
+bfdot v2.4s, v3.8h, v4.2h[1]
+bfdot v2.4s, v3.8h, v4.2h[2]
+bfdot v2.4s, v3.8h, v4.2h[3]
+// CHECK: bfdot  v2.4s, v3.8h, v4.2h[0]  // encoding: [0x62,0xf0,0x44,0x4f]
+// CHECK: bfdot  v2.4s, v3.8h, v4.2h[1]  // encoding: [0x62,0xf0,0x64,0x4f]
+// CHECK: bfdot  v2.4s, v3.8h, v4.2h[2]  // encoding: [0x62,0xf8,0x44,0x4f]
+// CHECK: bfdot  v2.4s, v3.8h, v4.2h[3]  // encoding: [0x62,0xf8,0x64,0x4f]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.4s, v3.8h, v4.2h[0]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.4s, v3.8h, v4.2h[1]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.4s, v3.8h, v4.2h[2]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfdot v2.4s, v3.8h, v4.2h[3]
+
+
+bfmmla v2.4s, v3.8h, v4.8h
+bfmmla v3.4s, v4.8h, v5.8h
+// CHECK: bfmmla v2.4s, v3.8h, v4.8h   // encoding: [0x62,0xec,0x44,0x6e]
+// CHECK: bfmmla v3.4s, v4.8h, v5.8h   // encoding: [0x83,0xec,0x45,0x6e]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfmmla v2.4s, v3.8h, v4.8h
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfmmla v3.4s, v4.8h, v5.8h
+
+bfcvtn  v5.4h, v5.4s
+bfcvtn2 v5.8h, v5.4s
+// CHECK: bfcvtn  v5.4h, v5.4s           // encoding: [0xa5,0x68,0xa1,0x0e]
+// CHECK: bfcvtn2 v5.8h, v5.4s           // encoding: [0xa5,0x68,0xa1,0x4e]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfcvtn  v5.4h, v5.4s
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfcvtn2 v5.8h, v5.4s
+
+bfcvt  h5, s3
+// CHECK: bfcvt   h5, s3               // encoding: [0x65,0x40,0x63,0x1e]
+// NOBF16: instruction requires: bf16
+// NOBF16-NEXT: bfcvt  h5, s3
+
+bfmlalb V10.4S, V21.8h, V14.8H
+bfmlalt V21.4S, V14.8h, V10.8H
+// CHECK:       bfmlalb	v10.4s, v21.8h, v14.8h  // encoding: [0xaa,0xfe,0xce,0x2e]
+// CHECK-NEXT:  bfmlalt	v21.4s, v14.8h, v10.8h  // encoding: [0xd5,0xfd,0xca,0x6e]
+// NOBF16:      error: instruction requires: bf16
+// NOBF16-NEXT: bfmlalb V10.4S, V21.8h, V14.8H
+// NOBF16-NEXT: ^
+// NOBF16:      instruction requires: bf16
+// NOBF16-NEXT: bfmlalt V21.4S, V14.8h, V10.8H
+// NOBF16-NEXT: ^
+
+bfmlalb V14.4S, V21.8H, V10.H[1]
+bfmlalb V14.4S, V21.8H, V10.H[2]
+bfmlalb V14.4S, V21.8H, V10.H[7]
+bfmlalt V21.4S, V10.8H, V14.H[1]
+bfmlalt V21.4S, V10.8H, V14.H[2]
+bfmlalt V21.4S, V10.8H, V14.H[7]
+// CHECK:      bfmlalb v14.4s, v21.8h, v10.h[1] // encoding: [0xae,0xf2,0xda,0x0f]
+// CHECK-NEXT: bfmlalb v14.4s, v21.8h, v10.h[2] // encoding: [0xae,0xf2,0xea,0x0f]
+// CHECK-NEXT: bfmlalb v14.4s, v21.8h, v10.h[7] // encoding: [0xae,0xfa,0xfa,0x0f]
+// CHECK-NEXT: bfmlalt v21.4s, v10.8h, v14.h[1] // encoding: [0x55,0xf1,0xde,0x4f]
+// CHECK-NEXT: bfmlalt v21.4s, v10.8h, v14.h[2] // encoding: [0x55,0xf1,0xee,0x4f]
+// CHECK-NEXT: bfmlalt v21.4s, v10.8h, v14.h[7] // encoding: [0x55,0xf9,0xfe,0x4f]
+// NOBF16:      error: instruction requires: bf16
+// NOBF16-NEXT: bfmlalb V14.4S, V21.8H, V10.H[1]
+// NOBF16-NEXT: ^
+// NOBF16:      error: instruction requires: bf16
+// NOBF16-NEXT: bfmlalb V14.4S, V21.8H, V10.H[2]
+// NOBF16-NEXT: ^
+// NOBF16:      error: instruction requires: bf16
+// NOBF16-NEXT: bfmlalb V14.4S, V21.8H, V10.H[7]
+// NOBF16-NEXT: ^
+// NOBF16:      instruction requires: bf16
+// NOBF16-NEXT: bfmlalt V21.4S, V10.8H, V14.H[1]
+// NOBF16-NEXT: ^
+// NOBF16:      instruction requires: bf16
+// NOBF16-NEXT: bfmlalt V21.4S, V10.8H, V14.H[2]
+// NOBF16-NEXT: ^
+// NOBF16:      instruction requires: bf16
+// NOBF16-NEXT: bfmlalt V21.4S, V10.8H, V14.H[7]
+// NOBF16-NEXT: ^
--- a/llvm/test/MC/ARM/bfloat16-a32-errors.s
+++ b/llvm/test/MC/ARM/bfloat16-a32-errors.s
@ -0,0 +1,57 @@
+// RUN: not llvm-mc -triple arm -mattr=+bf16,-neon %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=NONEON,ALL
+// RUN: not llvm-mc -triple arm -mattr=-bf16 %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=NOBF16,ALL
+// RUN: not llvm-mc -triple arm %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=NONEON,ALL
+//
+vdot.bf16 d3, d4, d5
+vdot.bf16 q0, q1, q2
+vdot.bf16 d3, d4, d5[1]
+vdot.bf16 q0, q1, d5[1]
+vmmla.bf16 q0, q1, q2
+vcvt.bf16.f32 d1, q3
+vcvtbeq.bf16.f32 s1, s3
+vcvttne.bf16.f32 s1, s3
+// NOBF16: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vdot.bf16 d3, d4, d5
+// NOBF16-NEXT: ^
+// NOBF16-NEXT: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vdot.bf16 q0, q1, q2
+// NOBF16-NEXT: ^
+// NOBF16-NEXT: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vdot.bf16 d3, d4, d5[1]
+// NOBF16-NEXT: ^
+// NOBF16-NEXT: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vdot.bf16 q0, q1, d5[1]
+// NOBF16-NEXT: ^
+// NOBF16-NEXT: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vmmla.bf16 q0, q1, q2
+// NOBF16-NEXT: ^
+// NOBF16-NEXT: error: instruction requires: BFloat16 floating point extension
+// NOBF16-NEXT: vcvt.bf16.f32 d1, q3
+// NOBF16-NEXT: ^
+
+// NONEON: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vdot.bf16 d3, d4, d5
+// NONEON-NEXT: ^
+// NONEON-NEXT: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vdot.bf16 q0, q1, q2
+// NONEON-NEXT: ^
+// NONEON-NEXT: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vdot.bf16 d3, d4, d5[1]
+// NONEON-NEXT: ^
+// NONEON-NEXT: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vdot.bf16 q0, q1, d5[1]
+// NONEON-NEXT: ^
+// NONEON-NEXT: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vmmla.bf16 q0, q1, q2
+// NONEON-NEXT: ^
+// NONEON-NEXT: error: instruction requires: BFloat16 floating point extension NEON
+// NONEON-NEXT: vcvt.bf16.f32 d1, q3
+// NONEON-NEXT: ^
+
+
+// ALL-NEXT: error: instruction requires: BFloat16 floating point extension
+// ALL-NEXT: vcvtbeq.bf16.f32 s1, s3
+// ALL-NEXT: ^
+// ALL-NEXT: error: instruction requires: BFloat16 floating point extension
+// ALL-NEXT: vcvttne.bf16.f32 s1, s3
+// ALL-NEXT: ^
--- a/llvm/test/MC/ARM/bfloat16-a32-errors2.s
+++ b/llvm/test/MC/ARM/bfloat16-a32-errors2.s
@ -0,0 +1,134 @@
+// RUN: not llvm-mc -o - -triple arm -mattr=+v8.6a -show-encoding %s 2>&1 | FileCheck %s
+vfmat.bf16 d0, d0, d0
+vfmat.bf16 d0, d0, q0
+vfmat.bf16 d0, q0, d0
+vfmat.bf16 q0, d0, d0
+vfmat.bf16 q0, q0, d0
+vfmat.bf16 q0, d0, q0
+vfmat.bf16 d0, q0, q0
+vfmat.bf16 q0, q0, q0[3]
+vfmat.bf16 q0, q0, q0[3]
+vfmat.bf16 q0, d0, d0[0]
+vfmat.bf16 d0, q0, d0[0]
+vfmat.bf16 q0, d0, d0[9]
+
+vfmab.bf16 d0, d0, d0
+vfmab.bf16 d0, d0, q0
+vfmab.bf16 d0, q0, d0
+vfmab.bf16 q0, d0, d0
+vfmab.bf16 q0, q0, d0
+vfmab.bf16 q0, d0, q0
+vfmab.bf16 d0, q0, q0
+vfmab.bf16 q0, q0, q0[3]
+vfmab.bf16 q0, q0, q0[3]
+vfmab.bf16 q0, d0, d0[0]
+vfmab.bf16 d0, q0, d0[0]
+vfmab.bf16 q0, d0, d0[9]
+
+//CHECK:error: invalid instruction
+//CHECK-NEXT:vfmat.bf16 d0, d0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmat.bf16 d0, d0, q0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmat.bf16 d0, q0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmat.bf16 q0, d0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmat.bf16 q0, q0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:note: too few operands for instruction
+//CHECK-NEXT:vfmat.bf16 q0, q0, d0
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:note: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmat.bf16 q0, q0, d0
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmat.bf16 q0, d0, q0
+//CHECK-NEXT:                ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmat.bf16 d0, q0, q0
+//CHECK-NEXT:            ^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:^
+//CHECK-NEXT:note: operand must be a register in range [d0, d7]
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:note: too many operands for instruction
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:^
+//CHECK-NEXT:note: operand must be a register in range [d0, d7]
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:note: too many operands for instruction
+//CHECK-NEXT:vfmat.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmat.bf16 q0, d0, d0[0]
+//CHECK-NEXT:                ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmat.bf16 d0, q0, d0[0]
+//CHECK-NEXT:            ^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmat.bf16 q0, d0, d0[9]
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmab.bf16 d0, d0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmab.bf16 d0, d0, q0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmab.bf16 d0, q0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmab.bf16 q0, d0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmab.bf16 q0, q0, d0
+//CHECK-NEXT:^
+//CHECK-NEXT:note: too few operands for instruction
+//CHECK-NEXT:vfmab.bf16 q0, q0, d0
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:note: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmab.bf16 q0, q0, d0
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmab.bf16 q0, d0, q0
+//CHECK-NEXT:                ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmab.bf16 d0, q0, q0
+//CHECK-NEXT:            ^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:^
+//CHECK-NEXT:note: operand must be a register in range [d0, d7]
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:note: too many operands for instruction
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:error: invalid instruction, any one of the following would fix this:
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:^
+//CHECK-NEXT:note: operand must be a register in range [d0, d7]
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                    ^
+//CHECK-NEXT:note: too many operands for instruction
+//CHECK-NEXT:vfmab.bf16 q0, q0, q0[3]
+//CHECK-NEXT:                      ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmab.bf16 q0, d0, d0[0]
+//CHECK-NEXT:                ^
+//CHECK-NEXT:error: operand must be a register in range [q0, q15]
+//CHECK-NEXT:vfmab.bf16 d0, q0, d0[0]
+//CHECK-NEXT:            ^
+//CHECK-NEXT:error: invalid instruction
+//CHECK-NEXT:vfmab.bf16 q0, d0, d0[9]
--- a/llvm/test/MC/ARM/bfloat16-a32.s
+++ b/llvm/test/MC/ARM/bfloat16-a32.s
@ -0,0 +1,55 @@
+// RUN: llvm-mc -triple arm -mattr=+bf16,+neon -show-encoding < %s | FileCheck %s  --check-prefix=CHECK
+// RUN: llvm-mc -triple arm -mattr=+v8.6a -show-encoding < %s | FileCheck %s  --check-prefix=CHECK
+
+vdot.bf16     d3, d4, d5
+// CHECK:     vdot.bf16  d3, d4, d5     @ encoding: [0x05,0x3d,0x04,0xfc]
+vdot.bf16    q0, q1, q2
+// CHECK-NEXT:     vdot.bf16 q0, q1, q2     @ encoding: [0x44,0x0d,0x02,0xfc]
+vdot.bf16     d3, d4, d5[1]
+// CHECK-NEXT:     vdot.bf16  d3, d4, d5[1] @ encoding: [0x25,0x3d,0x04,0xfe]
+vdot.bf16    q0, q1, d5[1]
+// CHECK-NEXT:     vdot.bf16  q0, q1, d5[1] @ encoding: [0x65,0x0d,0x02,0xfe]
+vmmla.bf16  q0, q1, q2
+// CHECK-NEXT:     vmmla.bf16 q0, q1, q2   @ encoding: [0x44,0x0c,0x02,0xfc]
+vcvt.bf16.f32 d1, q3
+// CHECK-NEXT:     vcvt.bf16.f32   d1, q3    @ encoding: [0x46,0x16,0xb6,0xf3]
+vcvtbeq.bf16.f32  s1, s3
+// CHECK-NEXT: vcvtbeq.bf16.f32 s1, s3       @ encoding: [0x61,0x09,0xf3,0x0e]
+vcvttne.bf16.f32 s1, s3
+// CHECK-NEXT: vcvttne.bf16.f32 s1, s3       @ encoding: [0xe1,0x09,0xf3,0x1e]
+vfmat.bf16 q0, q0, q0
+//CHECK-NEXT: vfmat.bf16      q0, q0, q0      @ encoding: [0x50,0x08,0x30,0xfc]
+vfmat.bf16 q0, q0, q15
+//CHECK-NEXT: vfmat.bf16      q0, q0, q15     @ encoding: [0x7e,0x08,0x30,0xfc]
+vfmat.bf16 q0, q15, q0
+//CHECK-NEXT: vfmat.bf16      q0, q15, q0     @ encoding: [0xd0,0x08,0x3e,0xfc]
+vfmat.bf16 q0, q15, q15
+//CHECK-NEXT: vfmat.bf16      q0, q15, q15     @ encoding: [0xfe,0x08,0x3e,0xfc]
+vfmat.bf16 q7, q0, q0
+//CHECK-NEXT: vfmat.bf16      q7, q0, q0      @ encoding: [0x50,0xe8,0x30,0xfc]
+vfmat.bf16 q8, q0, q0
+//CHECK-NEXT: vfmat.bf16      q8, q0, q0      @ encoding: [0x50,0x08,0x70,0xfc]
+vfmab.bf16 q0, q0, q0
+//CHECK-NEXT: vfmab.bf16      q0, q0, q0      @ encoding: [0x10,0x08,0x30,0xfc]
+vfmab.bf16 q0, q0, q15
+//CHECK-NEXT: vfmab.bf16      q0, q0, q15     @ encoding: [0x3e,0x08,0x30,0xfc]
+vfmab.bf16 q0, q15, q0
+//CHECK-NEXT: vfmab.bf16      q0, q15, q0     @ encoding: [0x90,0x08,0x3e,0xfc]
+vfmab.bf16 q0, q15, q15
+//CHECK-NEXT: vfmab.bf16      q0, q15, q15    @ encoding: [0xbe,0x08,0x3e,0xfc]
+vfmab.bf16 q7, q0, q0
+//CHECK-NEXT: vfmab.bf16      q7, q0, q0      @ encoding: [0x10,0xe8,0x30,0xfc]
+vfmab.bf16 q8, q0, q0
+//CHECK-NEXT: vfmab.bf16      q8, q0, q0      @ encoding: [0x10,0x08,0x70,0xfc]
+vfmat.bf16 q0, q0, d0[0]
+//CHECK-NEXT:  vfmat.bf16   q0, q0, d0[0]   @ encoding: [0x50,0x08,0x30,0xfe]
+vfmat.bf16 q0, q0, d0[3]
+//CHECK-NEXT:  vfmat.bf16   q0, q0, d0[3]   @ encoding: [0x78,0x08,0x30,0xfe]
+vfmat.bf16 q0, q0, d7[0]
+//CHECK-NEXT:  vfmat.bf16   q0, q0, d7[0]   @ encoding: [0x57,0x08,0x30,0xfe]
+vfmab.bf16 q0, q0, d0[0]
+//CHECK-NEXT:  vfmab.bf16   q0, q0, d0[0]   @ encoding: [0x10,0x08,0x30,0xfe]
+vfmab.bf16 q0, q0, d0[3]
+//CHECK-NEXT:  vfmab.bf16   q0, q0, d0[3]   @ encoding: [0x38,0x08,0x30,0xfe]
+vfmab.bf16 q0, q0, d7[0]
+//CHECK-NEXT:  vfmab.bf16   q0, q0, d7[0]   @ encoding: [0x17,0x08,0x30,0xfe]
--- a/llvm/test/MC/ARM/bfloat16-t32-errors.s
+++ b/llvm/test/MC/ARM/bfloat16-t32-errors.s
@ -0,0 +1,32 @@
+// RUN: not llvm-mc -triple thumbv8 -mattr=-bf16 < %s 2>&1 | FileCheck %s
+
+vdot.bf16     d3, d4, d5
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vdot.bf16     d3, d4, d5
+
+vdot.bf16    q0, q1, q2
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vdot.bf16    q0, q1, q2
+
+vdot.bf16    d3, d4, d5[1]
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vdot.bf16    d3, d4, d5[1]
+
+vdot.bf16    q0, q1, d5[1]
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vdot.bf16    q0, q1, d5[1]
+
+vmmla.bf16  q0, q1, q2
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vmmla.bf16  q0, q1, q2
+
+vcvt.bf16.f32 d1, q3
+// CHECK: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vcvt.bf16.f32 d1, q3
+
+vcvtbeq.bf16.f32  s1, s3
+// CHECK: note: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vcvtbeq.bf16.f32  s1, s3
+vcvttne.bf16.f32 s1, s3
+// CHECK: note: instruction requires: BFloat16 floating point extension
+// CHECK-NEXT: vcvttne.bf16.f32 s1, s3
--- a/llvm/test/MC/ARM/bfloat16-t32.s
+++ b/llvm/test/MC/ARM/bfloat16-t32.s
@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple thumbv8 -mattr=+bf16,+neon -show-encoding < %s | FileCheck %s  --check-prefix=CHECK
+// RUN: llvm-mc -triple thumbv8 -mattr=+v8.6a -show-encoding < %s | FileCheck %s  --check-prefix=CHECK
+
+vcvt.bf16.f32 d1, q3
+// CHECK:     vcvt.bf16.f32   d1, q3    @ encoding: [0xb6,0xff,0x46,0x16]
+
+it eq
+vcvtbeq.bf16.f32  s1, s3
+// CHECK: it eq                         @ encoding: [0x08,0xbf]
+// CHECK-NEXT: vcvtbeq.bf16.f32 s1, s3  @ encoding:  [0xf3,0xee,0x61,0x09]
+
+it ne
+vcvttne.bf16.f32 s1, s3
+// CHECK: it ne                         @ encoding: [0x18,0xbf]
+// CHECK-NEXT: vcvttne.bf16.f32 s1, s3  @ encoding: [0xf3,0xee,0xe1,0x09]
--- a/llvm/test/MC/Disassembler/AArch64/armv8.6a-bf16.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.6a-bf16.txt
@ -0,0 +1,74 @@
+# RUN:     llvm-mc -triple=aarch64  -mattr=+bf16  -disassemble < %s      | FileCheck %s
+# RUN:     llvm-mc -triple=aarch64  -mattr=+v8.6a -disassemble < %s      | FileCheck %s
+# RUN: not llvm-mc -triple=aarch64  -mattr=-bf16  -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOBF16
+# RUN: not llvm-mc -triple=aarch64                -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOBF16
+
+
+[0x62,0xfc,0x44,0x2e]
+[0x62,0xfc,0x44,0x6e]
+# CHECK: bfdot   v2.2s, v3.4h, v4.4h
+# CHECK: bfdot   v2.4s, v3.8h, v4.8h
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xfc,0x44,0x2e]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xfc,0x44,0x6e]
+
+[0x62,0xf0,0x44,0x4f]
+[0x62,0xf0,0x64,0x4f]
+[0x62,0xf8,0x44,0x4f]
+[0x62,0xf8,0x64,0x4f]
+# CHECK: bfdot  v2.4s, v3.8h, v4.2h[0]
+# CHECK: bfdot  v2.4s, v3.8h, v4.2h[1]
+# CHECK: bfdot  v2.4s, v3.8h, v4.2h[2]
+# CHECK: bfdot  v2.4s, v3.8h, v4.2h[3]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf0,0x44,0x4f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf0,0x64,0x4f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf8,0x44,0x4f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf8,0x64,0x4f]
+
+
+[0x62,0xf0,0x44,0x0f]
+[0x62,0xf0,0x64,0x0f]
+[0x62,0xf8,0x44,0x0f]
+[0x62,0xf8,0x64,0x0f]
+# CHECK: bfdot  v2.2s, v3.4h, v4.2h[0]
+# CHECK: bfdot  v2.2s, v3.4h, v4.2h[1]
+# CHECK: bfdot  v2.2s, v3.4h, v4.2h[2]
+# CHECK: bfdot  v2.2s, v3.4h, v4.2h[3]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf0,0x44,0x0f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf0,0x64,0x0f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf8,0x44,0x0f]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xf8,0x64,0x0f]
+
+
+[0x62,0xec,0x44,0x6e]
+[0x83,0xec,0x45,0x6e]
+# CHECK: bfmmla  v2.4s, v3.8h, v4.8h
+# CHECK: bfmmla  v3.4s, v4.8h, v5.8h
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x62,0xec,0x44,0x6e]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x83,0xec,0x45,0x6e]
+
+
+[0xa5,0x68,0xa1,0x0e]
+[0xa5,0x68,0xa1,0x4e]
+# CHECK: bfcvtn   v5.4h, v5.4s
+# CHECK: bfcvtn2  v5.8h, v5.4s
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0xa5,0x68,0xa1,0x0e]
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0xa5,0x68,0xa1,0x4e]
+
+[0x65, 0x40, 0x63, 0x1e]
+# CHECK: bfcvt  h5, s3
+# NOBF16: warning: invalid instruction encoding
+# NOBF16-NEXT: [0x65, 0x40, 0x63, 0x1e]
--- a/llvm/test/MC/Disassembler/ARM/bfloat16-a32_1.txt
+++ b/llvm/test/MC/Disassembler/ARM/bfloat16-a32_1.txt
@ -0,0 +1,102 @@
+# RUN: llvm-mc -triple arm-none-linux-gnu -mattr=+bf16,+neon --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple arm-none-linux-gnu -mattr=+v8.6a --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple arm-none-linux-gnu -mattr=-bf16 --disassemble < %s 2>&1  | FileCheck %s --check-prefix=NOBF16
+# RUN: llvm-mc -triple arm-none-linux-gnu --disassemble < %s 2>&1  | FileCheck %s --check-prefix=NOBF16
+#
+# Tests BFloat16 instruction decodings.
+# Without BFloat16 enabled, some of these get disassembled to coprocessor instructions.
+[0x25,0x3d,0x04,0xfe]
+# CHECK: vdot.bf16 d3, d4, d5[1]
+# NOBF16: cdp2  p13, #0, c3, c4, c5, #1
+#
+[0x65,0x0d,0x02,0xfe]
+# CHECK-NEXT: vdot.bf16  q0, q1, d5[1]
+# NOBF16-NEXT: cdp2  p13, #0, c0, c2, c5, #3
+#
+[0x61,0x09,0xf3,0x0e]
+# CHECK-NEXT: vcvtbeq.bf16.f32 s1, s3
+# NOBF16-NEXT: cdpeq p9, #15, c0, c3, c1, #3
+#
+[0xe1,0x09,0xf3,0x1e]
+# CHECK-NEXT: vcvttne.bf16.f32 s1, s3
+# NOBF16-NEXT: cdpne p9, #15, c0, c3, c1, #7
+#
+[0x50,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmat.bf16      q0, q0, q0
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-320
+#
+[0x7e,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmat.bf16      q0, q0, q15
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-504
+#
+[0xd0,0x08,0x3e,0xfc]
+# CHECK-NEXT: vfmat.bf16      q0, q15, q0
+# NOBF16-NEXT: ldc2    p8, c0, [lr], #-832
+#
+[0xfe,0x08,0x3e,0xfc]
+# CHECK-NEXT: vfmat.bf16      q0, q15, q15
+# NOBF16-NEXT: ldc2    p8, c0, [lr], #-1016
+#
+[0xd0,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmat.bf16      q0, q8, q0
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-832
+#
+[0x50,0xe8,0x30,0xfc]
+# CHECK-NEXT: vfmat.bf16      q7, q0, q0
+# NOBF16-NEXT: ldc2    p8, c14, [r0], #-320
+#
+[0x50,0x08,0x70,0xfc]
+# CHECK-NEXT: vfmat.bf16      q8, q0, q0
+# NOBF16-NEXT: ldc2l   p8, c0, [r0], #-320
+#
+[0x10,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmab.bf16      q0, q0, q0
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-64
+#
+[0x3e,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmab.bf16      q0, q0, q15
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-248
+#
+[0x90,0x08,0x3e,0xfc]
+# CHECK-NEXT: vfmab.bf16      q0, q15, q0
+# NOBF16-NEXT: ldc2    p8, c0, [lr], #-576
+#
+[0xbe,0x08,0x3e,0xfc]
+# CHECK-NEXT: vfmab.bf16      q0, q15, q15
+# NOBF16-NEXT: ldc2    p8, c0, [lr], #-760
+#
+[0x90,0x08,0x30,0xfc]
+# CHECK-NEXT: vfmab.bf16      q0, q8, q0
+# NOBF16-NEXT: ldc2    p8, c0, [r0], #-576
+#
+[0x10,0xe8,0x30,0xfc]
+# CHECK-NEXT: vfmab.bf16      q7, q0, q0
+# NOBF16-NEXT: ldc2    p8, c14, [r0], #-64
+#
+[0x10,0x08,0x70,0xfc]
+# CHECK-NEXT: vfmab.bf16      q8, q0, q0
+# NOBF16-NEXT: ldc2l   p8, c0, [r0], #-64
+#
+[0x50,0x08,0x30,0xfe]
+# CHECK-NEXT: vfmat.bf16 q0, q0, d0[0]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c0, #2
+#
+[0x78,0x08,0x30,0xfe]
+# CHECK-NEXT: vfmat.bf16 q0, q0, d0[3]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c8, #3
+[0x57,0x08,0x30,0xfe]
+#
+# CHECK-NEXT: vfmat.bf16 q0, q0, d7[0]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c7, #2
+[0x10,0x08,0x30,0xfe]
+#
+# CHECK-NEXT: vfmab.bf16 q0, q0, d0[0]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c0, #0
+[0x38,0x08,0x30,0xfe]
+#
+# CHECK-NEXT: vfmab.bf16 q0, q0, d0[3]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c8, #1
+#
+[0x17,0x08,0x30,0xfe]
+# CHECK-NEXT: vfmab.bf16 q0, q0, d7[0]
+# NOBF16-NEXT: mrc2  p8, #1, r0, c0, c7, #0
--- a/llvm/test/MC/Disassembler/ARM/bfloat16-a32_2.txt
+++ b/llvm/test/MC/Disassembler/ARM/bfloat16-a32_2.txt
@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple arm-none-linux-gnu -mattr=+bf16,+neon --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple arm-none-linux-gnu -mattr=+v8.6a --disassemble < %s | FileCheck %s
+# RUN: not llvm-mc -triple arm-none-linux-gnu -mattr=-bf16 --disassemble < %s 2>&1  | FileCheck %s --check-prefix=CHECK-NOBF16
+# RUN: not llvm-mc -triple arm-none-linux-gnu --disassemble < %s 2>&1  | FileCheck %s --check-prefix=CHECK-NOBF16
+
+[0x05,0x3d,0x04,0xfc]
+# CHECK: vdot.bf16  d3, d4, d5
+# CHECK-NOBF16: warning: invalid instruction encoding
+
+[0x44,0x0d,0x02,0xfc]
+# CHECK: vdot.bf16 q0, q1, q2
+# CHECK-NOBF16: warning: invalid instruction encoding
+
+[0x44,0x0c,0x02,0xfc]
+# CHECK: vmmla.bf16   q0, q1, q2
+# CHECK-NOBF16: warning: invalid instruction encoding
+
+[0x46,0x16,0xb6,0xf3]
+# CHECK: vcvt.bf16.f32   d1, q3
+# CHECK-NOBF16: warning: invalid instruction encoding
--- a/llvm/test/MC/Disassembler/ARM/bfloat16-t32.txt
+++ b/llvm/test/MC/Disassembler/ARM/bfloat16-t32.txt
@ -0,0 +1,25 @@
+# RUN: llvm-mc -triple thumbv8-none-linux-gnu -mattr=+bf16,+neon --disassemble  < %s | FileCheck %s
+# RUN: llvm-mc -triple thumbv8-none-linux-gnu -mattr=+v8.6a --disassemble  < %s | FileCheck %s
+
+[0x04,0xfc,0x05,0x3d]
+[0x02,0xfc,0x44,0x0d]
+# CHECK:     vdot.bf16  d3, d4, d5
+# CHECK:     vdot.bf16 q0, q1, q2
+
+[0x04,0xfe,0x25,0x3d]
+# CHECK: vdot.bf16    d3, d4, d5[1]
+
+[0x02,0xfe,0x65,0x0d]
+# CHECK: vdot.bf16  q0, q1, d5[1]
+
+[0x02,0xfc,0x44,0x0c]
+# CHECK:     vmmla.bf16 q0, q1, q2
+
+[0xb6,0xff,0x46,0x16]
+# CHECK:     vcvt.bf16.f32   d1, q3
+
+[0xf3,0xee,0x61,0x09]
+# CHECK: vcvtb.bf16.f32 s1, s3
+
+[0xf3,0xee,0xe1,0x09]
+# CHECK: vcvtt.bf16.f32 s1, s3
--- a/llvm/test/MC/Disassembler/ARM/bfloat16-t32_errors.txt
+++ b/llvm/test/MC/Disassembler/ARM/bfloat16-t32_errors.txt
@ -0,0 +1,40 @@
+# RUN: not llvm-mc -triple thumbv8-none-linux-gnu -mattr=-bf16 --disassemble < %s 2>&1  | FileCheck %s
+# RUN: not llvm-mc -triple thumbv8-none-linux-gnu --disassemble < %s 2>&1  | FileCheck %s
+
+[0x04,0xfc,0x05,0x3d]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x04,0xfc,0x05,0x3d]
+
+[0x02,0xfc,0x44,0x0d]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x02,0xfc,0x44,0x0d]
+
+
+[0x04,0xfe,0x25,0x3d]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x04,0xfe,0x25,0x3d]
+
+
+[0x02,0xfe,0x65,0x0d]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x02,0xfe,0x65,0x0d]
+
+
+[0x02,0xfc,0x44,0x0c]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x02,0xfc,0x44,0x0c]
+
+
+[0xb6,0xff,0x46,0x16]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0xb6,0xff,0x46,0x16]
+
+
+[0xf3,0xee,0x61,0x09]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0xf3,0xee,0x61,0x09]
+
+
+[0xf3,0xee,0xe1,0x09]
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0xf3,0xee,0xe1,0x09]
--- a/llvm/unittests/Support/TargetParserTest.cpp
+++ b/llvm/unittests/Support/TargetParserTest.cpp
@ -26,9 +26,9 @@ const char *ARMArch[] = {
    "armv7e-m",    "armv7em",      "armv8-a",     "armv8",        "armv8a",
    "armv8l",      "armv8.1-a",    "armv8.1a",    "armv8.2-a",    "armv8.2a",
    "armv8.3-a",   "armv8.3a",     "armv8.4-a",   "armv8.4a",     "armv8.5-a",
-    "armv8.5a",     "armv8-r",     "armv8r",      "armv8-m.base", "armv8m.base",
-    "armv8-m.main", "armv8m.main", "iwmmxt",      "iwmmxt2",      "xscale",
-    "armv8.1-m.main",
+    "armv8.5a",     "armv8.6-a",   "armv8.6a",     "armv8-r",     "armv8r",
+    "armv8-m.base", "armv8m.base", "armv8-m.main", "armv8m.main", "iwmmxt",
+    "iwmmxt2",      "xscale",      "armv8.1-m.main",
 };

 bool testARMCPU(StringRef CPUName, StringRef ExpectedArch,
@ -410,6 +410,9 @@ TEST(TargetParserTest, testARMArch) {
  EXPECT_TRUE(
      testARMArch("armv8.5-a", "generic", "v8.5a",
                          ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(
+      testARMArch("armv8.6-a", "generic", "v8.6a",
+                          ARMBuildAttrs::CPUArch::v8_A));
  EXPECT_TRUE(
      testARMArch("armv8-r", "cortex-r52", "v8r",
                          ARMBuildAttrs::CPUArch::v8_R));
@ -678,7 +681,7 @@ TEST(TargetParserTest, ARMparseArchEndianAndISA) {
      "v7",   "v7a",    "v7ve",  "v7hl",   "v7l",   "v7-r",   "v7r",   "v7-m",
      "v7m",  "v7k",    "v7s",   "v7e-m",  "v7em",  "v8-a",   "v8",    "v8a",
      "v8l",  "v8.1-a", "v8.1a", "v8.2-a", "v8.2a", "v8.3-a", "v8.3a", "v8.4-a",
-      "v8.4a", "v8.5-a","v8.5a", "v8-r",   "v8m.base", "v8m.main", "v8.1m.main"
+      "v8.4a", "v8.5-a","v8.5a", "v8.6-a", "v8.6a", "v8-r",   "v8m.base", "v8m.main", "v8.1m.main"
  };

  for (unsigned i = 0; i < array_lengthof(Arch); i++) {
@ -743,6 +746,7 @@ TEST(TargetParserTest, ARMparseArchProfile) {
    case ARM::ArchKind::ARMV8_3A:
    case ARM::ArchKind::ARMV8_4A:
    case ARM::ArchKind::ARMV8_5A:
+    case ARM::ArchKind::ARMV8_6A:
      EXPECT_EQ(ARM::ProfileKind::A, ARM::parseArchProfile(ARMArch[i]));
      break;
    default:
@ -1008,6 +1012,8 @@ TEST(TargetParserTest, testAArch64Arch) {
                              ARMBuildAttrs::CPUArch::v8_A));
  EXPECT_TRUE(testAArch64Arch("armv8.5-a", "generic", "v8.5a",
                              ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(testAArch64Arch("armv8.6-a", "generic", "v8.6a",
+                              ARMBuildAttrs::CPUArch::v8_A));
 }

 bool testAArch64Extension(StringRef CPUName, AArch64::ArchKind AK,