From d22e66171251cd3dd07507912189aa814a419678 Mon Sep 17 00:00:00 2001
From: Mikhail Maltsev <mikhail.maltsev@arm.com>
Date: Fri, 20 Mar 2020 14:01:53 +0000
Subject: [PATCH] [ARM,CDE] Implement CDE S and D-register intrinsics

Summary:
This patch implements the following ACLE intrinsics:

  uint32_t __arm_vcx1_u32(int coproc, uint32_t imm);
  uint32_t __arm_vcx1a_u32(int coproc, uint32_t acc, uint32_t imm);
  uint32_t __arm_vcx2_u32(int coproc, uint32_t n, uint32_t imm);
  uint32_t __arm_vcx2a_u32(int coproc, uint32_t acc, uint32_t n, uint32_t imm);
  uint32_t __arm_vcx3_u32(int coproc, uint32_t n, uint32_t m, uint32_t imm);
  uint32_t __arm_vcx3a_u32(int coproc, uint32_t acc, uint32_t n, uint32_t m, uint32_t imm);

  uint64_t __arm_vcx1d_u64(int coproc, uint32_t imm);
  uint64_t __arm_vcx1da_u64(int coproc, uint64_t acc, uint32_t imm);
  uint64_t __arm_vcx2d_u64(int coproc, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx2da_u64(int coproc, uint64_t acc, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx3d_u64(int coproc, uint64_t n, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx3da_u64(int coproc, uint64_t acc, uint64_t n, uint64_t m, uint32_t imm);

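Since the semantics of CDE instructions is opaque to the compiler, the
ACLE intrinsics require dedicated LLVM IR intrinsics. The 64-bit and
32-bit variants share the same IR intrinsic, which is overloaded on the
floating-point type (f32 for the S-register forms, f64 for the D-register
forms).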

Reviewers: simon_tatham, MarkMurrayARM, ostannard, dmgreen

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D76298
---
 clang/include/clang/Basic/arm_cde.td  |  37 +++++
 clang/test/CodeGen/arm-cde-vfp.c      | 145 +++++++++++++++++++
 clang/test/Sema/arm-cde-immediates.c  |  40 ++++++
 clang/utils/TableGen/MveEmitter.cpp   |   3 +
 llvm/include/llvm/IR/IntrinsicsARM.td |  16 +++
 llvm/lib/Target/ARM/ARMInstrCDE.td    |  39 +++++
 llvm/test/CodeGen/Thumb2/cde-vfp.ll   | 198 ++++++++++++++++++++++++++
 7 files changed, 478 insertions(+)
 create mode 100644 clang/test/CodeGen/arm-cde-vfp.c
 create mode 100644 llvm/test/CodeGen/Thumb2/cde-vfp.ll
diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td
index 139007d387a0..9cd0af8987c9 100644
--- a/clang/include/clang/Basic/arm_cde.td
+++ b/clang/include/clang/Basic/arm_cde.td
@@ -13,6 +13,15 @@
 
 include "arm_mve_defs.td"
 
+// f64 is not defined in arm_mve_defs.td because MVE instructions only work with
+// f16 and f32. It is defined here for the D-register (64-bit) VCX intrinsics.
+def f64: PrimitiveType<"f", 64>;
+
+// Float<t> expects t to be a scalar type, and expands to the floating-point
+// type of the same width. FScalar is that floating-point form of Scalar.
+class Float<Type t>: ComplexType<(CTO_CopyKind t, f32)>;
+def FScalar: Float<Scalar>;  // bitcast target for the u32/u64 operands below
+
 // ACLE CDE intrinsic
 class CDEIntrinsic<Type ret, dag args, dag codegen>
   : Intrinsic<ret, args, codegen> {
@@ -70,3 +79,31 @@ multiclass CDE_CX_m<dag argsImm, dag argsReg, dag cgArgs> {
 defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>;
 defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>;
 defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? $n, $m)>;
+
+// VCX* intrinsics on VFP registers: u32 via S-regs (f32), u64 via D-regs (f64)
+multiclass CDE_VCXFP_m<dag argsImm, dag argsReg32, dag argsReg64, dag cgArgs> {
+  defvar cp = (args imm_coproc:$cp);  // leading coprocessor-number argument
+  let pnt = PNT_None, params = [u32] in {  // 32-bit forms (S registers)
+    def "" : CDEIntrinsic<u32, !con(cp, argsReg32, argsImm),
+          (bitcast !con((CDEIRInt<NAME, [f32]> $cp), cgArgs, (? $imm)),
+                   Scalar)>;  // IR call yields f32; bitcast back to u32
+    def a  : CDEIntrinsic<u32, !con(cp, (args u32:$acc), argsReg32, argsImm),
+          (bitcast !con((CDEIRInt<NAME # "a", [f32]> $cp,
+                         (bitcast $acc, FScalar)), cgArgs, (? $imm)), Scalar)>;
+  }
+  let pnt = PNT_None, params = [u64] in {  // 64-bit forms (D registers)
+    def d  : CDEIntrinsic<u64, !con(cp, argsReg64, argsImm),
+          (bitcast !con((CDEIRInt<NAME, [f64]> $cp), cgArgs, (? $imm)),
+                   Scalar)>;  // same IR intrinsic name, instantiated for f64
+    def da : CDEIntrinsic<u64, !con(cp, (args u64:$acc), argsReg64, argsImm),
+          (bitcast !con((CDEIRInt<NAME # "a", [f64]> $cp,
+                         (bitcast $acc, FScalar)), cgArgs, (? $imm)), Scalar)>;
+  }
+}
+// Each extra register operand narrows the immediate: 11, 6, then 3 bits.
+defm vcx1: CDE_VCXFP_m<(args imm_11b:$imm), (args), (args), (?)>;
+defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n),
+                       (? (bitcast $n, FScalar))>;
+defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
+                       (args u32:$n, u32:$m), (args u64:$n, u64:$m),
+                       (? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
diff --git a/clang/test/CodeGen/arm-cde-vfp.c b/clang/test/CodeGen/arm-cde-vfp.c
new file mode 100644
index 000000000000..fffcb716359d
--- /dev/null
+++ b/clang/test/CodeGen/arm-cde-vfp.c
@@ -0,0 +1,145 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \
+// RUN:   -target-feature +cdecp0 -target-feature +cdecp1 \
+// RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
+// RUN:   -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_cde.h>
+
+// CHECK-LABEL: @test_vcx1_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11)
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_vcx1_u32(void) {
+  return __arm_vcx1_u32(0, 11);
+}
+
+// CHECK-LABEL: @test_vcx1a_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.arm.cde.vcx1a.f32(i32 1, float [[TMP0]], i32 12)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vcx1a_u32(uint32_t acc) {
+  return __arm_vcx1a_u32(1, acc, 12);
+}
+
+// CHECK-LABEL: @test_vcx2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.arm.cde.vcx2.f32(i32 0, float [[TMP0]], i32 21)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vcx2_u32(uint32_t n) {
+  return __arm_vcx2_u32(0, n, 21);
+}
+
+// CHECK-LABEL: @test_vcx2a_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.cde.vcx2a.f32(i32 0, float [[TMP0]], float [[TMP1]], i32 22)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
+uint32_t test_vcx2a_u32(uint32_t acc, uint32_t n) {
+  return __arm_vcx2a_u32(0, acc, n, 22);
+}
+
+// CHECK-LABEL: @test_vcx3_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[M:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.cde.vcx3.f32(i32 1, float [[TMP0]], float [[TMP1]], i32 3)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
+uint32_t test_vcx3_u32(uint32_t n, uint32_t m) {
+  return __arm_vcx3_u32(1, n, m, 3);
+}
+
+// CHECK-LABEL: @test_vcx3a_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[M:%.*]] to float
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.arm.cde.vcx3a.f32(i32 0, float [[TMP0]], float [[TMP1]], float [[TMP2]], i32 5)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
+uint32_t test_vcx3a_u32(uint32_t acc, uint32_t n, uint32_t m) {
+  return __arm_vcx3a_u32(0, acc, n, m, 5);
+}
+
+// CHECK-LABEL: @test_vcx1d_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11)
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[TMP0]] to i64
+// CHECK-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_vcx1d_u64(void) {
+  return __arm_vcx1d_u64(0, 11);
+}
+
+// CHECK-LABEL: @test_vcx1da_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.arm.cde.vcx1a.f64(i32 1, double [[TMP0]], i32 12)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
+uint64_t test_vcx1da_u64(uint64_t acc) {
+  return __arm_vcx1da_u64(1, acc, 12);
+}
+
+// CHECK-LABEL: @test_vcx2d_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.arm.cde.vcx2.f64(i32 0, double [[TMP0]], i32 21)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
+uint64_t test_vcx2d_u64(uint64_t n) {
+  return __arm_vcx2d_u64(0, n, 21);
+}
+
+// CHECK-LABEL: @test_vcx2da_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.arm.cde.vcx2a.f64(i32 0, double [[TMP0]], double [[TMP1]], i32 22)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+// CHECK-NEXT:    ret i64 [[TMP3]]
+//
+uint64_t test_vcx2da_u64(uint64_t acc, uint64_t n) {
+  return __arm_vcx2da_u64(0, acc, n, 22);
+}
+
+// CHECK-LABEL: @test_vcx3d_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[M:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.arm.cde.vcx3.f64(i32 1, double [[TMP0]], double [[TMP1]], i32 3)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+// CHECK-NEXT:    ret i64 [[TMP3]]
+//
+uint64_t test_vcx3d_u64(uint64_t n, uint64_t m) {
+  return __arm_vcx3d_u64(1, n, m, 3);
+}
+
+// CHECK-LABEL: @test_vcx3da_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[M:%.*]] to double
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.arm.cde.vcx3a.f64(i32 0, double [[TMP0]], double [[TMP1]], double [[TMP2]], i32 5)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+// CHECK-NEXT:    ret i64 [[TMP4]]
+//
+uint64_t test_vcx3da_u64(uint64_t acc, uint64_t n, uint64_t m) {
+  return __arm_vcx3da_u64(0, acc, n, m, 5);
+}
diff --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c
index d521e099c7d1..19159f9be4ea 100644
--- a/clang/test/Sema/arm-cde-immediates.c
+++ b/clang/test/Sema/arm-cde-immediates.c
@@ -63,3 +63,43 @@ void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) {
   __arm_cx3da(0, da, n, m, a);  // expected-error {{argument to '__arm_cx3da' must be a constant integer}}
   __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
 }
+
+void test_vcxfp_u32(uint32_t a, uint32_t n, uint32_t m) {
+  (void)__arm_vcx1_u32(0, 0);
+  __arm_vcx1_u32(0, a);        // expected-error {{argument to '__arm_vcx1_u32' must be a constant integer}}
+  __arm_vcx1_u32(0, 2048);     // expected-error {{argument value 2048 is outside the valid range [0, 2047]}}
+  __arm_vcx1a_u32(0, a, a);    // expected-error {{argument to '__arm_vcx1a_u32' must be a constant integer}}
+  __arm_vcx1a_u32(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}}
+
+  (void)__arm_vcx2_u32(0, n, 0);
+  __arm_vcx2_u32(0, n, a);      // expected-error {{argument to '__arm_vcx2_u32' must be a constant integer}}
+  __arm_vcx2_u32(0, n, 64);     // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+  __arm_vcx2a_u32(0, a, n, a);  // expected-error {{argument to '__arm_vcx2a_u32' must be a constant integer}}
+  __arm_vcx2a_u32(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+
+  (void)__arm_vcx3_u32(0, n, m, 0);
+  __arm_vcx3_u32(0, n, m, a);     // expected-error {{argument to '__arm_vcx3_u32' must be a constant integer}}
+  __arm_vcx3_u32(0, n, m, 8);     // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  __arm_vcx3a_u32(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3a_u32' must be a constant integer}}
+  __arm_vcx3a_u32(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) {
+  (void)__arm_vcx1d_u64(0, 0);
+  __arm_vcx1d_u64(0, a);        // expected-error {{argument to '__arm_vcx1d_u64' must be a constant integer}}
+  __arm_vcx1d_u64(0, 2048);     // expected-error {{argument value 2048 is outside the valid range [0, 2047]}}
+  __arm_vcx1da_u64(0, a, a);    // expected-error {{argument to '__arm_vcx1da_u64' must be a constant integer}}
+  __arm_vcx1da_u64(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}}
+
+  (void)__arm_vcx2d_u64(0, n, 0);
+  __arm_vcx2d_u64(0, n, a);      // expected-error {{argument to '__arm_vcx2d_u64' must be a constant integer}}
+  __arm_vcx2d_u64(0, n, 64);     // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+  __arm_vcx2da_u64(0, a, n, a);  // expected-error {{argument to '__arm_vcx2da_u64' must be a constant integer}}
+  __arm_vcx2da_u64(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+
+  (void)__arm_vcx3d_u64(0, n, m, 0);
+  __arm_vcx3d_u64(0, n, m, a);     // expected-error {{argument to '__arm_vcx3d_u64' must be a constant integer}}
+  __arm_vcx3d_u64(0, n, m, 8);     // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
+  __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index f75f5000f0f6..076b491ff94f 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1995,6 +1995,9 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) {
     const ScalarType *ST = kv.second.get();
     if (ST->hasNonstandardName())
       continue;
+    // Skip f64: it has no vector counterpart (we don't have float64x2_t)
+    if (ST->kind() == ScalarTypeKind::Float && ST->sizeInBits() == 64)
+      continue;
     raw_ostream &OS = parts[ST->requiresFloat() ? MVEFloat : MVE];
     const VectorType *VT = getVectorType(ST);
 
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index ba0cf909e5de..de2e6a39abeb 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1301,4 +1301,20 @@ defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>;
 defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>;
 defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>;
 
+multiclass CDEVCXIntrinsics<list<LLVMType> args> {  // args: register operands
+  def "" : Intrinsic<
+    [llvm_anyfloat_ty],  // result; overloads the intrinsic on the FP type
+    !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;  // coproc, imm
+  def a : Intrinsic<  // same, with an acc operand inserted after coproc
+    [llvm_anyfloat_ty],
+    !listconcat([llvm_i32_ty /* coproc */,  LLVMMatchType<0> /* acc */],
+                args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;  // coproc, imm
+}
+
+defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
+defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
+defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
+
 } // end TargetPrefix
diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
index 648911acd26c..9497e1733689 100644
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -542,3 +542,42 @@ def CDE_VCX3_fpdp   : CDE_VCX3_FP_Instr_D<"vcx3",  cde_vcx_params_d_noacc>;
 def CDE_VCX3A_fpdp  : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>;
 def CDE_VCX3_vec    : CDE_VCX3_Vec_Instr<"vcx3",   cde_vcx_params_q_noacc>;
 def CDE_VCX3A_vec   : CDE_VCX3_Vec_Instr<"vcx3a",  cde_vcx_params_q_acc>;
+
+// ISel patterns: f32 intrinsics -> S-reg (fpsp), f64 -> D-reg (fpdp) forms.
+let Predicates = [HasCDE, HasFPRegs] in {
+  def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+            (f32 (CDE_VCX1_fpsp p_imm:$coproc, imm_11b:$imm))>;
+  def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)),
+            (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+            (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)),
+            (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>;
+
+  def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)),
+            (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>;
+  def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+                                    timm:$imm)),
+            (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)),
+            (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+                                    timm:$imm)),
+            (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>;
+
+  def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m),
+                                   timm:$imm)),
+            (f32 (CDE_VCX3_fpsp p_imm:$coproc, SPR:$n, SPR:$m,
+                                imm_3b:$imm))>;
+  def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+                                    (f32 SPR:$m), timm:$imm)),
+            (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m,
+                                 imm_3b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m),
+                                   timm:$imm)),
+            (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>;
+  def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+                                    (f64 DPR:$m), timm:$imm)),
+            (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
+                                 imm_3b:$imm))>;
+}
diff --git a/llvm/test/CodeGen/Thumb2/cde-vfp.ll b/llvm/test/CodeGen/Thumb2/cde-vfp.ll
new file mode 100644
index 000000000000..54ee1d516661
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/cde-vfp.ll
@@ -0,0 +1,198 @@
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+fp-armv8d16sp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare float @llvm.arm.cde.vcx1.f32(i32 immarg, i32 immarg)
+declare float @llvm.arm.cde.vcx1a.f32(i32 immarg, float, i32 immarg)
+declare float @llvm.arm.cde.vcx2.f32(i32 immarg, float, i32 immarg)
+declare float @llvm.arm.cde.vcx2a.f32(i32 immarg, float, float, i32 immarg)
+declare float @llvm.arm.cde.vcx3.f32(i32 immarg, float, float, i32 immarg)
+declare float @llvm.arm.cde.vcx3a.f32(i32 immarg, float, float, float, i32 immarg)
+
+declare double @llvm.arm.cde.vcx1.f64(i32 immarg, i32 immarg)
+declare double @llvm.arm.cde.vcx1a.f64(i32 immarg, double, i32 immarg)
+declare double @llvm.arm.cde.vcx2.f64(i32 immarg, double, i32 immarg)
+declare double @llvm.arm.cde.vcx2a.f64(i32 immarg, double, double, i32 immarg)
+declare double @llvm.arm.cde.vcx3.f64(i32 immarg, double, double, i32 immarg)
+declare double @llvm.arm.cde.vcx3a.f64(i32 immarg, double, double, double, i32 immarg)
+
+define arm_aapcs_vfpcc i32 @test_vcx1_u32() {
+; CHECK-LABEL: test_vcx1_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx1 p0, s0, #11
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11)
+  %1 = bitcast float %0 to i32
+  ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vcx1a_u32(i32 %acc) {
+; CHECK-LABEL: test_vcx1a_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcx1a p1, s0, #12
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32 %acc to float
+  %1 = call float @llvm.arm.cde.vcx1a.f32(i32 1, float %0, i32 12)
+  %2 = bitcast float %1 to i32
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vcx2_u32(i32 %n) {
+; CHECK-LABEL: test_vcx2_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcx2 p0, s0, s0, #21
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32 %n to float
+  %1 = call float @llvm.arm.cde.vcx2.f32(i32 0, float %0, i32 21)
+  %2 = bitcast float %1 to i32
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vcx2a_u32(i32 %acc, i32 %n) {
+; CHECK-LABEL: test_vcx2a_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vcx2a p0, s2, s0, #22
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32 %acc to float
+  %1 = bitcast i32 %n to float
+  %2 = call float @llvm.arm.cde.vcx2a.f32(i32 0, float %0, float %1, i32 22)
+  %3 = bitcast float %2 to i32
+  ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vcx3_u32(i32 %n, i32 %m) {
+; CHECK-LABEL: test_vcx3_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vcx3 p1, s0, s2, s0, #3
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32 %n to float
+  %1 = bitcast i32 %m to float
+  %2 = call float @llvm.arm.cde.vcx3.f32(i32 1, float %0, float %1, i32 3)
+  %3 = bitcast float %2 to i32
+  ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vcx3a_u32(i32 %acc, i32 %n, i32 %m) {
+; CHECK-LABEL: test_vcx3a_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r2
+; CHECK-NEXT:    vmov s2, r1
+; CHECK-NEXT:    vmov s4, r0
+; CHECK-NEXT:    vcx3a p0, s4, s2, s0, #5
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32 %acc to float
+  %1 = bitcast i32 %n to float
+  %2 = bitcast i32 %m to float
+  %3 = call float @llvm.arm.cde.vcx3a.f32(i32 0, float %0, float %1, float %2, i32 5)
+  %4 = bitcast float %3 to i32
+  ret i32 %4
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx1d_u64() {
+; CHECK-LABEL: test_vcx1d_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx1 p0, d0, #11
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11)
+  %1 = bitcast double %0 to i64
+  ret i64 %1
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx1da_u64(i64 %acc) {
+; CHECK-LABEL: test_vcx1da_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vcx1a p1, d0, #12
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i64 %acc to double
+  %1 = call double @llvm.arm.cde.vcx1a.f64(i32 1, double %0, i32 12)
+  %2 = bitcast double %1 to i64
+  ret i64 %2
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx2d_u64(i64 %n) {
+; CHECK-LABEL: test_vcx2d_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vcx2 p0, d0, d0, #21
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i64 %n to double
+  %1 = call double @llvm.arm.cde.vcx2.f64(i32 0, double %0, i32 21)
+  %2 = bitcast double %1 to i64
+  ret i64 %2
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx2da_u64(i64 %acc, i64 %n) {
+; CHECK-LABEL: test_vcx2da_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r2, r3
+; CHECK-NEXT:    vmov d1, r0, r1
+; CHECK-NEXT:    vcx2a p0, d1, d0, #22
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i64 %acc to double
+  %1 = bitcast i64 %n to double
+  %2 = call double @llvm.arm.cde.vcx2a.f64(i32 0, double %0, double %1, i32 22)
+  %3 = bitcast double %2 to i64
+  ret i64 %3
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx3d_u64(i64 %n, i64 %m) {
+; CHECK-LABEL: test_vcx3d_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r2, r3
+; CHECK-NEXT:    vmov d1, r0, r1
+; CHECK-NEXT:    vcx3 p1, d0, d1, d0, #3
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i64 %n to double
+  %1 = bitcast i64 %m to double
+  %2 = call double @llvm.arm.cde.vcx3.f64(i32 1, double %0, double %1, i32 3)
+  %3 = bitcast double %2 to i64
+  ret i64 %3
+}
+
+define arm_aapcs_vfpcc i64 @test_vcx3da_u64(i64 %acc, i64 %n, i64 %m) {
+; CHECK-LABEL: test_vcx3da_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    ldrd lr, r12, [sp, #8]
+; CHECK-DAG:     vmov [[D0:d.*]], r0, r1
+; CHECK-DAG:     vmov [[D1:d.*]], r2, r3
+; CHECK-DAG:     vmov [[D2:d.*]], lr, r12
+; CHECK-NEXT:    vcx3a p0, [[D0]], [[D1]], [[D2]], #5
+; CHECK-NEXT:    vmov r0, r1, [[D0]]
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i64 %acc to double
+  %1 = bitcast i64 %n to double
+  %2 = bitcast i64 %m to double
+  %3 = call double @llvm.arm.cde.vcx3a.f64(i32 0, double %0, double %1, double %2, i32 5)
+  %4 = bitcast double %3 to i64
+  ret i64 %4
+}