[ARM,CDE] Implement CDE unpredicated Q-register intrinsics

Summary: This patch implements the following intrinsics: uint8x16_t __arm_vcx1q_u8(int coproc, uint32_t imm); T __arm_vcx1qa(int coproc, T acc, uint32_t imm); T __arm_vcx2q(int coproc, T n, uint32_t imm); uint8x16_t __arm_vcx2q_u8(int coproc, T n, uint32_t imm); T __arm_vcx2qa(int coproc, T acc, U n, uint32_t imm); T __arm_vcx3q(int coproc, T n, U m, uint32_t imm); uint8x16_t __arm_vcx3q_u8(int coproc, T n, U m, uint32_t imm); T __arm_vcx3qa(int coproc, T acc, U n, V m, uint32_t imm); Most of them are polymorphic. Furthermore, some intrinsics are polymorphic in 2 or 3 parameter types. Such polymorphism is not supported by the existing MVE/CDE tablegen backends, and we don't really want to have a combinatorial explosion caused by 1000 different combinations of 3 vector types. Because of this, some intrinsics are implemented as macros involving a cast of the polymorphic arguments to uint8x16_t. The IR intrinsics are even more restricted in terms of types: all MVE vectors are cast to v16i8. Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76299
2020-03-20 14:01:56 +00:00 · 2020-03-20 14:01:56 +00:00 · 969034b860
parent d22e661712
commit 969034b860
7 changed files with 405 additions and 1 deletions
--- a/clang/include/clang/Basic/arm_cde.td
+++ b/clang/include/clang/Basic/arm_cde.td
@ -37,6 +37,13 @@ class CDEImmediateBits<int numBits> : Immediate<u32, IB_ConstBits<numBits>>;
 class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0>
      : IRIntBase<"arm_cde_" # name, params, appendKind>;
 // Class for generating function macros in arm_cde.h:
 // "#define <name>(<params>) <definition>"
 // `params_` lists the macro's parameter names in order and `definition_` is
 // the replacement text. The CDE emitter prefixes the record name with
 // "__arm_" when it prints the macro.
 class FunctionMacro<list<string> params_, string definition_> {
  // Macro parameter names; printed comma-separated inside the parentheses.
  list<string> params = params_;
  // Macro replacement text; printed as-is after the parameter list.
  string definition = definition_;
 }
 // Coprocessor immediate
 def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>;
@ -107,3 +114,77 @@ defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n),
 defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
                       (args u32:$n, u32:$m), (args u64:$n, u64:$m),
                       (? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
 // VCX* instructions operating on Q vector registers
 def v16u8 : VecOf<u8>;
 // __arm_vcx1q_u8: instantiated for u8 only and not polymorphic
 // (pnt = PNT_None). It takes just the coprocessor number and a 12-bit
 // immediate, so there is no vector argument whose type could be dispatched on.
 let pnt = PNT_None, params = [u8] in
 def vcx1q : CDEIntrinsic<Vector, (args imm_coproc:$cp, imm_12b:$imm),
                         (CDEIRInt<"vcx1q"> $cp, $imm)>;
 // Polymorphic-only Q-register intrinsics, instantiated for every type in
 // T.All. The IR intrinsics operate on v16u8 only, so each Vector-typed
 // operand is bitcast to v16u8 before the call, and the result is bitcast back
 // to Vector unless the intrinsic is declared to return v16u8.
 //
 // The *_impl variants are polymorphic in their first vector operand only;
 // their remaining vector operands are already v16u8. The user-facing names
 // for these are FunctionMacro records that convert the extra operands with
 // __arm_vreinterpretq_u8.
 let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in {
  // T __arm_vcx1qa(coproc, T acc, imm12)
  def vcx1qa :
    CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$acc, imm_12b:$imm),
            (bitcast (CDEIRInt<"vcx1qa"> $cp, (bitcast $acc, v16u8), $imm),
                     Vector)>;
  // T __arm_vcx2q(coproc, T n, imm7)
  def vcx2q :
    CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
            (bitcast (CDEIRInt<"vcx2q"> $cp, (bitcast $n, v16u8), $imm),
                     Vector)>;
  // uint8x16_t __arm_vcx2q_u8(coproc, T n, imm7)
  def vcx2q_u8 :
    CDEIntrinsic<v16u8, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
            (CDEIRInt<"vcx2q"> $cp, (bitcast $n, v16u8), $imm)>;
  // T __arm_vcx2qa_impl(coproc, T acc, uint8x16_t n, imm7)
  def vcx2qa_impl :
    CDEIntrinsic<Vector,
            (args imm_coproc:$cp, Vector:$acc, v16u8:$n, imm_7b:$imm),
            (bitcast (CDEIRInt<"vcx2qa"> $cp, (bitcast $acc, v16u8), $n, $imm),
                     Vector)>;
  // T __arm_vcx3q_impl(coproc, T n, uint8x16_t m, imm4)
  def vcx3q_impl :
    CDEIntrinsic<Vector,
            (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
            (bitcast (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm),
                     Vector)>;
  // uint8x16_t __arm_vcx3q_u8_impl(coproc, T n, uint8x16_t m, imm4)
  def vcx3q_u8_impl :
    CDEIntrinsic<v16u8,
            (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
            (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm)>;
  // T __arm_vcx3qa_impl(coproc, T acc, uint8x16_t n, uint8x16_t m, imm4)
  def vcx3qa_impl :
    CDEIntrinsic<Vector,
            (args imm_coproc:$cp, Vector:$acc, v16u8:$n, v16u8:$m, imm_4b:$imm),
            (bitcast (CDEIRInt<"vcx3qa"> $cp, (bitcast $acc, v16u8), $n, $m,
                                         $imm),
                     Vector)>;
 }
 // Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3
 // polymorphic parameters.
 let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32],
    headerOnly = 1, polymorphicOnly = 1 in
 def vreinterpretq_u8 :
    Intrinsic<v16u8, (args Vector:$x), (vreinterpret $x, v16u8)>;
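 // We need vreinterpretq_u8_u8 (an identity conversion) so that the macros can
 // apply __arm_vreinterpretq_u8 to every argument without special-casing
 // arguments that are already uint8x16_t.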
 let params = [u8], polymorphicOnly = 1 in
 def vreinterpretq_u8_cde :
    CDEIntrinsic<v16u8, (args Vector:$x), (id $x)>,
    NameOverride<"vreinterpretq_u8">;
 // Macro wrappers for the intrinsics that are polymorphic in more than one
 // vector type. Each forwards to the matching *_impl intrinsic, which resolves
 // the first vector argument polymorphically; every further vector argument is
 // converted to uint8x16_t with __arm_vreinterpretq_u8 first.

 // __arm_vcx2qa(cp, acc, n, imm): `n` is reinterpreted.
 def vcx2qa : FunctionMacro<
  ["cp", "acc", "n", "imm"],
  "__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">;
 // __arm_vcx3q(cp, n, m, imm): `m` is reinterpreted.
 def vcx3q : FunctionMacro<
  ["cp", "n", "m", "imm"],
  "__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
 // __arm_vcx3q_u8(cp, n, m, imm): `m` is reinterpreted.
 def vcx3q_u8 : FunctionMacro<
  ["cp", "n", "m", "imm"],
  "__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
 // __arm_vcx3qa(cp, acc, n, m, imm): both `n` and `m` are reinterpreted.
 def vcx3qa : FunctionMacro<
  ["cp", "acc", "n", "m", "imm"],
  "__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), "
                     "__arm_vreinterpretq_u8(m), (imm))">;
--- a/clang/test/CodeGen/arm-cde-vec.c
+++ b/clang/test/CodeGen/arm-cde-vec.c
@ -0,0 +1,104 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \
 // RUN:   -target-feature +cdecp0 -target-feature +cdecp1 \
 // RUN:   -target-feature +mve.fp \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
 // RUN:   -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 #include <arm_cde.h>
 // CHECK-LABEL: @test_vcx1q_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vcx1q_u8(void) {
  return __arm_vcx1q_u8(0, 1111);
 }
 // CHECK-LABEL: @test_vcx1qa_1(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vcx1qa_1(uint8x16_t acc) {
  return __arm_vcx1qa(1, acc, 1112);
 }
 // CHECK-LABEL: @test_vcx1qa_2(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113)
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vcx1qa_2(int32x4_t acc) {
  return __arm_vcx1qa(0, acc, 1113);
 }
 // CHECK-LABEL: @test_vcx2q_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111)
 // CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 //
 uint8x16_t test_vcx2q_u8(float16x8_t n) {
  return __arm_vcx2q_u8(1, n, 111);
 }
 // CHECK-LABEL: @test_vcx2q(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112)
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vcx2q(float32x4_t n) {
  return __arm_vcx2q(1, n, 112);
 }
 // CHECK-LABEL: @test_vcx2qa(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113)
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) {
  return __arm_vcx2qa(0, acc, n, 113);
 }
 // CHECK-LABEL: @test_vcx3q_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) {
  return __arm_vcx3q_u8(0, n, m, 11);
 }
 // CHECK-LABEL: @test_vcx3q(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12)
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
 // CHECK-NEXT:    ret <2 x i64> [[TMP3]]
 //
 uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) {
  return __arm_vcx3q(1, n, m, 12);
 }
 // CHECK-LABEL: @test_vcx3qa(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) {
  return __arm_vcx3qa(1, acc, n, m, 13);
 }
--- a/clang/test/Sema/arm-cde-immediates.c
+++ b/clang/test/Sema/arm-cde-immediates.c
@ -103,3 +103,27 @@ void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) {
  __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
  __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
 }
 // Checks the immediate-operand diagnostics of the Q-register CDE intrinsics.
 // For each intrinsic, a non-constant immediate and a value one past the
 // largest accepted immediate must both be rejected.
 // For the macro-implemented intrinsics (__arm_vcx2qa, __arm_vcx3q,
 // __arm_vcx3q_u8, __arm_vcx3qa) the "must be a constant integer" diagnostic
 // names the underlying *_impl function rather than the macro.
 // NOTE(review): the __arm_vcx2qa and __arm_vcx3qa calls pass `n` in the
 // accumulator position and `acc` as the last vector argument. Presumably this
 // is harmless because only the immediate is under test -- confirm.
 void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) {
  // vcx1q / vcx1qa: 12-bit immediate.
  (void)__arm_vcx1q_u8(0, 0);
  __arm_vcx1q_u8(0, a);       // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}}
  __arm_vcx1q_u8(0, 4096);    // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
  __arm_vcx1qa(0, acc, a);    // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}}
  __arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
  // vcx2q / vcx2qa: 7-bit immediate.
  (void)__arm_vcx2q_u8(0, n, 0);
  __arm_vcx2q_u8(0, n, a);      // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}}
  __arm_vcx2q_u8(0, n, 128);    // expected-error {{argument value 128 is outside the valid range [0, 127]}}
  __arm_vcx2q(0, n, a);         // expected-error {{argument to '__arm_vcx2q' must be a constant integer}}
  __arm_vcx2q(0, n, 128);       // expected-error {{argument value 128 is outside the valid range [0, 127]}}
  __arm_vcx2qa(0, n, acc, a);   // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}}
  __arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
  // vcx3q / vcx3qa: 4-bit immediate.
  (void)__arm_vcx3q_u8(0, n, m, 0);
  __arm_vcx3q_u8(0, n, m, a);     // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}}
  __arm_vcx3q_u8(0, n, m, 16);    // expected-error {{argument value 16 is outside the valid range [0, 15]}}
  __arm_vcx3q(0, n, m, a);        // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}}
  __arm_vcx3q(0, n, m, 16);       // expected-error {{argument value 16 is outside the valid range [0, 15]}}
  __arm_vcx3qa(0, n, m, acc, a);  // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}}
  __arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
 }
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@ -1962,18 +1962,48 @@ void MveEmitter::EmitBuiltinSema(raw_ostream &OS) {
  }
 }
 // -----------------------------------------------------------------------------
 // Class that describes an ACLE intrinsic implemented as a macro.
 //
 // This class is used when the intrinsic is polymorphic in 2 or 3 types, but we
 // want to avoid a combinatorial explosion by reinterpreting the arguments to
 // fixed types.
 // In-memory form of one TableGen FunctionMacro record: the macro's parameter
 // names and its replacement text.
 class FunctionMacro {
  // Parameter names, in declaration order (the record's "params" field).
  std::vector<StringRef> Params;
  // Replacement text (the record's "definition" field).
  StringRef Definition;
 public:
  // Reads both fields from the given record.
  FunctionMacro(const Record &R);
  const std::vector<StringRef> &getParams() const { return Params; }
  StringRef getDefinition() const { return Definition; }
 };
 // Populate the macro description from the "params" and "definition" fields
 // of a FunctionMacro record.
 FunctionMacro::FunctionMacro(const Record &R)
     : Params(R.getValueAsListOfStrings("params")),
       Definition(R.getValueAsString("definition")) {}
 // -----------------------------------------------------------------------------
 // The class used for generating arm_cde.h and related Clang bits
 //
 class CdeEmitter : public EmitterBase {
  // Macro-implemented intrinsics, keyed by TableGen record name. EmitHeader
  // walks this map and prints one "#define __arm_<name>(...)" per entry.
  std::map<StringRef, FunctionMacro> FunctionMacros;
 public:
-  CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){};
+  CdeEmitter(RecordKeeper &Records);
  void EmitHeader(raw_ostream &OS) override;
  void EmitBuiltinDef(raw_ostream &OS) override;
  void EmitBuiltinSema(raw_ostream &OS) override;
 };
 // Besides the common EmitterBase setup, collect every record derived from the
 // TableGen class "FunctionMacro" into FunctionMacros, keyed by record name.
 CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) {
  for (Record *MacroDef : Records.getAllDerivedDefinitions("FunctionMacro")) {
    StringRef MacroName = MacroDef->getName();
    FunctionMacros.emplace(MacroName, FunctionMacro(*MacroDef));
  }
 }
 void CdeEmitter::EmitHeader(raw_ostream &OS) {
  // Accumulate pieces of the header file that will be enabled under various
  // different combinations of #ifdef. The index into parts[] is one of the
@ -2051,6 +2081,16 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) {
    }
  }
  for (const auto &kv : FunctionMacros) {
    StringRef Name = kv.first;
    const FunctionMacro &FM = kv.second;
    raw_ostream &OS = parts[MVE];
    OS << "#define "
       << "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") "
       << FM.getDefinition() << "\n";
  }
  for (auto &part : parts)
    part << "\n";
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@ -1317,4 +1317,20 @@ defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
 defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
 defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
 // Defines a pair of Q-register CDE intrinsics returning v16i8: the base form
 // (empty suffix) and the accumulating form (suffix "a"). `args` holds the
 // operand types that sit between the leading operands and the trailing
 // immediate.
 multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
  // Operands: coproc, args..., imm.
  // The immediate is last, i.e. at index size(args) + 1.
  def "" : Intrinsic<
    [llvm_v16i8_ty],
    !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
  // Operands: coproc, acc, args..., imm.
  // The extra accumulator shifts the immediate to index size(args) + 2.
  def a : Intrinsic<
    [llvm_v16i8_ty],
    !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
                args, [llvm_i32_ty /* imm */]),
    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
 }
 defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;
 defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>;
 defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>;
 } // end TargetPrefix
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@ -581,3 +581,28 @@ let Predicates = [HasCDE, HasFPRegs] in {
            (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
                                 imm_3b:$imm))>;
 }
 // Selection patterns for the Q-register CDE intrinsics, enabled under the
 // HasCDE and HasMVEInt predicates. Each int_arm_cde_vcx{1,2,3}q[a] intrinsic
 // maps onto the corresponding CDE_VCX{1,2,3}[A]_vec instruction. The
 // coprocessor number and the immediate are matched as target immediates
 // (timm), and all vector operands are v16i8 values in MQPR registers.
 let Predicates = [HasCDE, HasMVEInt] in {
  // VCX1 / VCX1A: 12-bit immediate.
  def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
            (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
  def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
                                       timm:$imm)),
            (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
  // VCX2 / VCX2A: one vector source, 7-bit immediate.
  def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
            (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
  def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
                                       (v16i8 MQPR:$n), timm:$imm)),
            (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
                                  imm_7b:$imm))>;
  // VCX3 / VCX3A: two vector sources, 4-bit immediate.
  def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
                                      (v16i8 MQPR:$m), timm:$imm)),
            (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
                                 imm_4b:$imm))>;
  def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
                                       (v16i8 MQPR:$n), (v16i8 MQPR:$m),
                                       timm:$imm)),
            (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
                                  imm_4b:$imm))>;
 }
--- a/llvm/test/CodeGen/Thumb2/cde-vec.ll
+++ b/llvm/test/CodeGen/Thumb2/cde-vec.ll
@ -0,0 +1,114 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
 declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg)
 define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() {
 ; CHECK-LABEL: test_vcx1q_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx1 p0, q0, #1111
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
  ret <16 x i8> %0
 }
 define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) {
 ; CHECK-LABEL: test_vcx1qa_1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx1a p1, q0, #1112
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112)
  ret <16 x i8> %0
 }
 define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) {
 ; CHECK-LABEL: test_vcx1qa_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx1a p0, q0, #1113
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <4 x i32> %acc to <16 x i8>
  %1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  ret <4 x i32> %2
 }
 define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) {
 ; CHECK-LABEL: test_vcx2q_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx2 p1, q0, q0, #111
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <8 x half> %n to <16 x i8>
  %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111)
  ret <16 x i8> %1
 }
 define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) {
 ; CHECK-LABEL: test_vcx2q:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx2 p1, q0, q0, #112
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <4 x float> %n to <16 x i8>
  %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112)
  %2 = bitcast <16 x i8> %1 to <4 x float>
  ret <4 x float> %2
 }
 define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) {
 ; CHECK-LABEL: test_vcx2qa:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx2a p0, q0, q1, #113
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <4 x float> %acc to <16 x i8>
  %1 = bitcast <2 x i64> %n to <16 x i8>
  %2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
 }
 define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) {
 ; CHECK-LABEL: test_vcx3q_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx3 p0, q0, q0, q1, #11
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <8 x i16> %n to <16 x i8>
  %1 = bitcast <4 x i32> %m to <16 x i8>
  %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11)
  ret <16 x i8> %2
 }
 define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) {
 ; CHECK-LABEL: test_vcx3q:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx3 p1, q0, q0, q1, #12
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <2 x i64> %n to <16 x i8>
  %1 = bitcast <4 x float> %m to <16 x i8>
  %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12)
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
 }
 define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) {
 ; CHECK-LABEL: test_vcx3qa:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcx3a p1, q0, q1, q2, #13
 ; CHECK-NEXT:    bx lr
 entry:
  %0 = bitcast <8 x i16> %n to <16 x i8>
  %1 = bitcast <4 x float> %m to <16 x i8>
  %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
  ret <16 x i8> %2
 }