[ARM,CDE] Implement CDE unpredicated Q-register intrinsics

Summary:
This patch implements the following intrinsics:

  uint8x16_t __arm_vcx1q_u8 (int coproc, uint32_t imm);
  T __arm_vcx1qa(int coproc, T acc, uint32_t imm);
  T __arm_vcx2q(int coproc, T n, uint32_t imm);
  uint8x16_t __arm_vcx2q_u8(int coproc, T n, uint32_t imm);
  T __arm_vcx2qa(int coproc, T acc, U n, uint32_t imm);
  T __arm_vcx3q(int coproc, T n, U m, uint32_t imm);
  uint8x16_t __arm_vcx3q_u8(int coproc, T n, U m, uint32_t imm);
  T __arm_vcx3qa(int coproc, T acc, U n, V m, uint32_t imm);

Most of them are polymorphic. Furthermore, some intrinsics are
polymorphic in 2 or 3 parameter types. Such polymorphism is not
supported by the existing MVE/CDE tablegen backends, and we also want
to avoid the combinatorial explosion caused by 1000 different
combinations of 3 vector types. Because of this, some intrinsics are
implemented as macros involving a cast of the polymorphic arguments to
uint8x16_t.

The IR intrinsics are even more restricted in terms of types: all MVE
vectors are cast to v16i8.

Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D76299
This commit is contained in:
Mikhail Maltsev 2020-03-20 14:01:56 +00:00
parent d22e661712
commit 969034b860
7 changed files with 405 additions and 1 deletions

View File

@ -37,6 +37,13 @@ class CDEImmediateBits<int numBits> : Immediate<u32, IB_ConstBits<numBits>>;
class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0> class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0>
: IRIntBase<"arm_cde_" # name, params, appendKind>; : IRIntBase<"arm_cde_" # name, params, appendKind>;
// Class for generating function macros in arm_cde.h:
// "#define <name>(<params>) <definition>"
//
// params_     - the macro's parameter names, in order.
// definition_ - the macro's replacement text, emitted verbatim.
//
// The emitter prefixes the name of the deriving def with "__arm_" to form
// <name>.
class FunctionMacro<list<string> params_, string definition_> {
list<string> params = params_;
string definition = definition_;
}
// Coprocessor immediate // Coprocessor immediate
def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>; def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>;
@ -107,3 +114,77 @@ defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n),
defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm), defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
(args u32:$n, u32:$m), (args u64:$n, u64:$m), (args u32:$n, u32:$m), (args u64:$n, u64:$m),
(? (bitcast $n, FScalar), (bitcast $m, FScalar))>; (? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
// VCX* instructions operating on Q vector registers
// v16u8 is a named alias for the vector-of-u8 type; the Q-register
// definitions in this file use it for their non-polymorphic vector operands.
def v16u8 : VecOf<u8>;
// vcx1q takes only the coprocessor number and a 12-bit immediate. It is not
// polymorphic (PNT_None) and is instantiated for u8 only.
let pnt = PNT_None, params = [u8] in
def vcx1q : CDEIntrinsic<Vector, (args imm_coproc:$cp, imm_12b:$imm),
(CDEIRInt<"vcx1q"> $cp, $imm)>;
// Q-register intrinsics that are polymorphic in the type of their first
// vector argument (params = T.All, polymorphicOnly = 1).
//
// The underlying IR intrinsics take and return the u8 vector type only, so
// every Vector-typed operand is bitcast to v16u8 before the call, and the
// result is bitcast back to Vector for the variants that return Vector.
//
// The *_impl definitions take their second (and third) vector operand
// already typed as v16u8. They are wrapped by the FunctionMacro definitions
// for vcx2qa, vcx3q, vcx3q_u8 and vcx3qa in this file, which reinterpret
// those operands before forwarding them.
let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in {
  // acc is both an input and the type of the result.
  def vcx1qa :
    CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$acc, imm_12b:$imm),
                 (bitcast (CDEIRInt<"vcx1qa"> $cp, (bitcast $acc, v16u8), $imm),
                          Vector)>;

  // Result has the same type as n.
  def vcx2q :
    CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
                 (bitcast (CDEIRInt<"vcx2q"> $cp, (bitcast $n, v16u8), $imm),
                          Vector)>;

  // Same IR intrinsic as vcx2q, but the result stays v16u8 (no bitcast back).
  def vcx2q_u8 :
    CDEIntrinsic<v16u8, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
                 (CDEIRInt<"vcx2q"> $cp, (bitcast $n, v16u8), $imm)>;

  def vcx2qa_impl :
    CDEIntrinsic<Vector,
                 (args imm_coproc:$cp, Vector:$acc, v16u8:$n, imm_7b:$imm),
                 (bitcast (CDEIRInt<"vcx2qa"> $cp, (bitcast $acc, v16u8), $n,
                                              $imm),
                          Vector)>;

  def vcx3q_impl :
    CDEIntrinsic<Vector,
                 (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
                 (bitcast (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m,
                                             $imm),
                          Vector)>;

  // Same IR intrinsic as vcx3q_impl, but the result stays v16u8.
  def vcx3q_u8_impl :
    CDEIntrinsic<v16u8,
                 (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
                 (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm)>;

  def vcx3qa_impl :
    CDEIntrinsic<Vector,
                 (args imm_coproc:$cp, Vector:$acc, v16u8:$n, v16u8:$m,
                       imm_4b:$imm),
                 (bitcast (CDEIRInt<"vcx3qa"> $cp, (bitcast $acc, v16u8), $n,
                                              $m, $imm),
                          Vector)>;
}
// Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3
// polymorphic parameters.
// u8 is deliberately left out of params here; the u8 -> u8 case is provided
// by the separate vreinterpretq_u8_cde definition in this file.
let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32],
headerOnly = 1, polymorphicOnly = 1 in
def vreinterpretq_u8 :
Intrinsic<v16u8, (args Vector:$x), (vreinterpret $x, v16u8)>;
// We need vreinterpretq_u8_u8 to avoid doing smart tricks in the macros:
// the function macros in this file apply __arm_vreinterpretq_u8 to their
// vector arguments unconditionally, so an overload accepting u8 must exist.
// It is defined as (id $x) and exposed under the name vreinterpretq_u8 via
// NameOverride.
let params = [u8], polymorphicOnly = 1 in
def vreinterpretq_u8_cde :
CDEIntrinsic<v16u8, (args Vector:$x), (id $x)>,
NameOverride<"vreinterpretq_u8">;
// Function macros for the intrinsics that are polymorphic in more than one
// vector type. Each macro forwards to the matching *_impl intrinsic, passing
// the first vector argument through unchanged and wrapping every further
// vector argument in __arm_vreinterpretq_u8.
//
// Because the user-facing names are macros, diagnostics about a bad immediate
// name the *_impl function (e.g. '__arm_vcx2qa_impl') rather than the macro.
def vcx2qa : FunctionMacro<
["cp", "acc", "n", "imm"],
"__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">;
def vcx3q : FunctionMacro<
["cp", "n", "m", "imm"],
"__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
def vcx3q_u8 : FunctionMacro<
["cp", "n", "m", "imm"],
"__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
def vcx3qa : FunctionMacro<
["cp", "acc", "n", "m", "imm"],
"__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), "
"__arm_vreinterpretq_u8(m), (imm))">;

View File

@ -0,0 +1,104 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \
// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \
// RUN: -target-feature +mve.fp \
// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \
// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_cde.h>
// CHECK-LABEL: @test_vcx1q_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vcx1q_u8(void) {
return __arm_vcx1q_u8(0, 1111);
}
// CHECK-LABEL: @test_vcx1qa_1(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vcx1qa_1(uint8x16_t acc) {
return __arm_vcx1qa(1, acc, 1112);
}
// CHECK-LABEL: @test_vcx1qa_2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vcx1qa_2(int32x4_t acc) {
return __arm_vcx1qa(0, acc, 1113);
}
// CHECK-LABEL: @test_vcx2q_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111)
// CHECK-NEXT: ret <16 x i8> [[TMP1]]
//
uint8x16_t test_vcx2q_u8(float16x8_t n) {
return __arm_vcx2q_u8(1, n, 111);
}
// CHECK-LABEL: @test_vcx2q(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vcx2q(float32x4_t n) {
return __arm_vcx2q(1, n, 112);
}
// CHECK-LABEL: @test_vcx2qa(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113)
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) {
return __arm_vcx2qa(0, acc, n, 113);
}
// CHECK-LABEL: @test_vcx3q_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) {
return __arm_vcx3q_u8(0, n, m, 11);
}
// CHECK-LABEL: @test_vcx3q(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12)
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
//
uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) {
return __arm_vcx3q(1, n, m, 12);
}
// CHECK-LABEL: @test_vcx3qa(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) {
return __arm_vcx3qa(1, acc, n, m, 13);
}

View File

@ -103,3 +103,27 @@ void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) {
__arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}} __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
__arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
} }
// Immediate-operand checks for the Q-register CDE intrinsics: for each
// intrinsic, a non-constant immediate and an immediate one past the top of
// the valid range must both be rejected.
// vcx2qa, vcx3q, vcx3q_u8 and vcx3qa are macros, so the "must be a constant
// integer" diagnostic names the underlying *_impl function instead.
void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) {
(void)__arm_vcx1q_u8(0, 0);
__arm_vcx1q_u8(0, a); // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}}
__arm_vcx1q_u8(0, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
__arm_vcx1qa(0, acc, a); // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}}
__arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
(void)__arm_vcx2q_u8(0, n, 0);
__arm_vcx2q_u8(0, n, a); // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}}
__arm_vcx2q_u8(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
__arm_vcx2q(0, n, a); // expected-error {{argument to '__arm_vcx2q' must be a constant integer}}
__arm_vcx2q(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
__arm_vcx2qa(0, n, acc, a); // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}}
__arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
(void)__arm_vcx3q_u8(0, n, m, 0);
__arm_vcx3q_u8(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}}
__arm_vcx3q_u8(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
__arm_vcx3q(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}}
__arm_vcx3q(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
__arm_vcx3qa(0, n, m, acc, a); // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}}
__arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
}

View File

@ -1962,18 +1962,48 @@ void MveEmitter::EmitBuiltinSema(raw_ostream &OS) {
} }
} }
// -----------------------------------------------------------------------------
// Class that describes an ACLE intrinsic implemented as a macro.
//
// This class is used when the intrinsic is polymorphic in 2 or 3 types, but we
// want to avoid a combinatorial explosion by reinterpreting the arguments to
// fixed types.
//
// NOTE(review): both members are non-owning StringRefs; presumably the
// underlying strings are owned by the TableGen records and outlive this
// object -- confirm.
class FunctionMacro {
// Macro parameter names, taken from the record's "params" field.
std::vector<StringRef> Params;
// Macro replacement text, taken from the record's "definition" field.
StringRef Definition;
public:
// Builds the description from a TableGen record R.
FunctionMacro(const Record &R);
// Returns the macro parameter names, in declaration order.
const std::vector<StringRef> &getParams() const { return Params; }
// Returns the macro replacement text.
StringRef getDefinition() const { return Definition; }
};
// Reads the macro's parameter list and replacement text from the "params" and
// "definition" fields of the TableGen record R.
FunctionMacro::FunctionMacro(const Record &R)
    : Params(R.getValueAsListOfStrings("params")),
      Definition(R.getValueAsString("definition")) {}
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// The class used for generating arm_cde.h and related Clang bits // The class used for generating arm_cde.h and related Clang bits
// //
class CdeEmitter : public EmitterBase { class CdeEmitter : public EmitterBase {
std::map<StringRef, FunctionMacro> FunctionMacros;
public: public:
CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){}; CdeEmitter(RecordKeeper &Records);
void EmitHeader(raw_ostream &OS) override; void EmitHeader(raw_ostream &OS) override;
void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override;
void EmitBuiltinSema(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override;
}; };
// In addition to the base-class setup, collects every record derived from the
// TableGen class "FunctionMacro" into FunctionMacros, keyed by record name.
CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) {
  for (Record *MacroRec : Records.getAllDerivedDefinitions("FunctionMacro")) {
    StringRef MacroName = MacroRec->getName();
    FunctionMacros.emplace(MacroName, FunctionMacro(*MacroRec));
  }
}
void CdeEmitter::EmitHeader(raw_ostream &OS) { void CdeEmitter::EmitHeader(raw_ostream &OS) {
// Accumulate pieces of the header file that will be enabled under various // Accumulate pieces of the header file that will be enabled under various
// different combinations of #ifdef. The index into parts[] is one of the // different combinations of #ifdef. The index into parts[] is one of the
@ -2051,6 +2081,16 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) {
} }
} }
for (const auto &kv : FunctionMacros) {
StringRef Name = kv.first;
const FunctionMacro &FM = kv.second;
raw_ostream &OS = parts[MVE];
OS << "#define "
<< "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") "
<< FM.getDefinition() << "\n";
}
for (auto &part : parts) for (auto &part : parts)
part << "\n"; part << "\n";

View File

@ -1317,4 +1317,20 @@ defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>; defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>; defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
// Defines a pair of Q-register CDE intrinsics, both returning v16i8:
//   ""  : (coproc, args..., imm)
//   "a" : (coproc, acc, args..., imm)  -- takes an extra v16i8 accumulator
// `args` lists the additional operands placed between the coprocessor number
// (and accumulator, if any) and the trailing immediate.
// The coprocessor number (operand 0) and the trailing immediate are both
// ImmArg; the immediate's operand index is computed from !size(args).
multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
def "" : Intrinsic<
[llvm_v16i8_ty],
!listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
def a : Intrinsic<
[llvm_v16i8_ty],
!listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
args, [llvm_i32_ty /* imm */]),
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
}
// Each defm yields the plain intrinsic and its "a"-suffixed accumulating
// counterpart (e.g. int_arm_cde_vcx1q and int_arm_cde_vcx1qa), taking zero,
// one and two additional v16i8 operands respectively.
defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;
defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>;
defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>;
} // end TargetPrefix } // end TargetPrefix

View File

@ -581,3 +581,28 @@ let Predicates = [HasCDE, HasFPRegs] in {
(f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m, (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
imm_3b:$imm))>; imm_3b:$imm))>;
} }
// Selection patterns mapping the Q-register CDE intrinsics onto the
// CDE_VCX*_vec instructions. All vector operands and results are v16i8 in
// MQPR registers; the coprocessor number and the immediate are matched as
// target immediates (timm). Guarded by both HasCDE and HasMVEInt.
let Predicates = [HasCDE, HasMVEInt] in {
def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
(v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
timm:$imm)),
(v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
(v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
(v16i8 MQPR:$n), timm:$imm)),
(v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
imm_7b:$imm))>;
def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
(v16i8 MQPR:$m), timm:$imm)),
(v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
imm_4b:$imm))>;
def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
(v16i8 MQPR:$n), (v16i8 MQPR:$m),
timm:$imm)),
(v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
imm_4b:$imm))>;
}

View File

@ -0,0 +1,114 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg)
declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg)
define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() {
; CHECK-LABEL: test_vcx1q_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx1 p0, q0, #1111
; CHECK-NEXT: bx lr
entry:
%0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) {
; CHECK-LABEL: test_vcx1qa_1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx1a p1, q0, #1112
; CHECK-NEXT: bx lr
entry:
%0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112)
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) {
; CHECK-LABEL: test_vcx1qa_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx1a p0, q0, #1113
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x i32> %acc to <16 x i8>
%1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113)
%2 = bitcast <16 x i8> %1 to <4 x i32>
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) {
; CHECK-LABEL: test_vcx2q_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx2 p1, q0, q0, #111
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x half> %n to <16 x i8>
%1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111)
ret <16 x i8> %1
}
define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) {
; CHECK-LABEL: test_vcx2q:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx2 p1, q0, q0, #112
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x float> %n to <16 x i8>
%1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112)
%2 = bitcast <16 x i8> %1 to <4 x float>
ret <4 x float> %2
}
define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) {
; CHECK-LABEL: test_vcx2qa:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx2a p0, q0, q1, #113
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x float> %acc to <16 x i8>
%1 = bitcast <2 x i64> %n to <16 x i8>
%2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113)
%3 = bitcast <16 x i8> %2 to <4 x float>
ret <4 x float> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) {
; CHECK-LABEL: test_vcx3q_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx3 p0, q0, q0, q1, #11
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x i16> %n to <16 x i8>
%1 = bitcast <4 x i32> %m to <16 x i8>
%2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) {
; CHECK-LABEL: test_vcx3q:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx3 p1, q0, q0, q1, #12
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <2 x i64> %n to <16 x i8>
%1 = bitcast <4 x float> %m to <16 x i8>
%2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12)
%3 = bitcast <16 x i8> %2 to <2 x i64>
ret <2 x i64> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) {
; CHECK-LABEL: test_vcx3qa:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcx3a p1, q0, q1, q2, #13
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x i16> %n to <16 x i8>
%1 = bitcast <4 x float> %m to <16 x i8>
%2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
ret <16 x i8> %2
}