forked from OSchip/llvm-project
[ARM,CDE] Implement CDE unpredicated Q-register intrinsics
Summary: This patch implements the following intrinsics: uint8x16_t __arm_vcx1q_u8 (int coproc, uint32_t imm); T __arm_vcx1qa(int coproc, T acc, uint32_t imm); T __arm_vcx2q(int coproc, T n, uint32_t imm); uint8x16_t __arm_vcx2q_u8(int coproc, T n, uint32_t imm); T __arm_vcx2qa(int coproc, T acc, U n, uint32_t imm); T __arm_vcx3q(int coproc, T n, U m, uint32_t imm); uint8x16_t __arm_vcx3q_u8(int coproc, T n, U m, uint32_t imm); T __arm_vcx3qa(int coproc, T acc, U n, V m, uint32_t imm); Most of them are polymorphic. Furthermore, some intrinsics are polymorphic in 2 or 3 parameter types. Such polymorphism is not supported by the existing MVE/CDE tablegen backends, and we also don't really want a combinatorial explosion caused by 1000 different combinations of 3 vector types. Because of this, some intrinsics are implemented as macros involving a cast of the polymorphic arguments to uint8x16_t. The IR intrinsics are even more restricted in terms of types: all MVE vectors are cast to v16i8. Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76299
This commit is contained in:
parent
d22e661712
commit
969034b860
|
@ -37,6 +37,13 @@ class CDEImmediateBits<int numBits> : Immediate<u32, IB_ConstBits<numBits>>;
|
||||||
class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0>
|
class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0>
|
||||||
: IRIntBase<"arm_cde_" # name, params, appendKind>;
|
: IRIntBase<"arm_cde_" # name, params, appendKind>;
|
||||||
|
|
||||||
|
// Class for generating function macros in arm_cde.h:
|
||||||
|
// "#define <name>(<params>) <definition>"
|
||||||
|
class FunctionMacro<list<string> params_, string definition_> {
|
||||||
|
list<string> params = params_;
|
||||||
|
string definition = definition_;
|
||||||
|
}
|
||||||
|
|
||||||
// Coprocessor immediate
|
// Coprocessor immediate
|
||||||
def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>;
|
def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>;
|
||||||
|
|
||||||
|
@ -107,3 +114,77 @@ defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n),
|
||||||
defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
|
defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
|
||||||
(args u32:$n, u32:$m), (args u64:$n, u64:$m),
|
(args u32:$n, u32:$m), (args u64:$n, u64:$m),
|
||||||
(? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
|
(? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
|
||||||
|
|
||||||
|
// VCX* instructions operating on Q vector registers
|
||||||
|
|
||||||
|
def v16u8 : VecOf<u8>;
|
||||||
|
|
||||||
|
let pnt = PNT_None, params = [u8] in
|
||||||
|
def vcx1q : CDEIntrinsic<Vector, (args imm_coproc:$cp, imm_12b:$imm),
|
||||||
|
(CDEIRInt<"vcx1q"> $cp, $imm)>;
|
||||||
|
|
||||||
|
let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in {
|
||||||
|
def vcx1qa :
|
||||||
|
CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$acc, imm_12b:$imm),
|
||||||
|
(bitcast (CDEIRInt<"vcx1qa"> $cp, (bitcast $acc, v16u8), $imm),
|
||||||
|
Vector)>;
|
||||||
|
|
||||||
|
def vcx2q :
|
||||||
|
CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
|
||||||
|
(bitcast (CDEIRInt<"vcx2q"> $cp, (bitcast $n, VecOf<u8>), $imm),
|
||||||
|
Vector)>;
|
||||||
|
def vcx2q_u8 :
|
||||||
|
CDEIntrinsic<v16u8, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
|
||||||
|
(CDEIRInt<"vcx2q"> $cp, (bitcast $n, VecOf<u8>), $imm)>;
|
||||||
|
|
||||||
|
def vcx2qa_impl :
|
||||||
|
CDEIntrinsic<Vector,
|
||||||
|
(args imm_coproc:$cp, Vector:$acc, v16u8:$n, imm_7b:$imm),
|
||||||
|
(bitcast (CDEIRInt<"vcx2qa"> $cp, (bitcast $acc, v16u8), $n, $imm),
|
||||||
|
Vector)>;
|
||||||
|
|
||||||
|
def vcx3q_impl :
|
||||||
|
CDEIntrinsic<Vector,
|
||||||
|
(args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
|
||||||
|
(bitcast (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm),
|
||||||
|
Vector)>;
|
||||||
|
def vcx3q_u8_impl :
|
||||||
|
CDEIntrinsic<v16u8,
|
||||||
|
(args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
|
||||||
|
(CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm)>;
|
||||||
|
def vcx3qa_impl :
|
||||||
|
CDEIntrinsic<Vector,
|
||||||
|
(args imm_coproc:$cp, Vector:$acc, v16u8:$n, v16u8:$m, imm_4b:$imm),
|
||||||
|
(bitcast (CDEIRInt<"vcx3qa"> $cp, (bitcast $acc, v16u8), $n, $m,
|
||||||
|
$imm),
|
||||||
|
Vector)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3
|
||||||
|
// polymorphic parameters.
|
||||||
|
let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32],
|
||||||
|
headerOnly = 1, polymorphicOnly = 1 in
|
||||||
|
def vreinterpretq_u8 :
|
||||||
|
Intrinsic<v16u8, (args Vector:$x), (vreinterpret $x, v16u8)>;
|
||||||
|
|
||||||
|
// We need vreinterpretq_u8_u8 to avoid doing smart tricks in the macros
|
||||||
|
let params = [u8], polymorphicOnly = 1 in
|
||||||
|
def vreinterpretq_u8_cde :
|
||||||
|
CDEIntrinsic<v16u8, (args Vector:$x), (id $x)>,
|
||||||
|
NameOverride<"vreinterpretq_u8">;
|
||||||
|
|
||||||
|
|
||||||
|
def vcx2qa : FunctionMacro<
|
||||||
|
["cp", "acc", "n", "imm"],
|
||||||
|
"__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">;
|
||||||
|
|
||||||
|
def vcx3q : FunctionMacro<
|
||||||
|
["cp", "n", "m", "imm"],
|
||||||
|
"__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
|
||||||
|
def vcx3q_u8 : FunctionMacro<
|
||||||
|
["cp", "n", "m", "imm"],
|
||||||
|
"__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
|
||||||
|
def vcx3qa : FunctionMacro<
|
||||||
|
["cp", "acc", "n", "m", "imm"],
|
||||||
|
"__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), "
|
||||||
|
"__arm_vreinterpretq_u8(m), (imm))">;
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||||
|
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \
|
||||||
|
// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \
|
||||||
|
// RUN: -target-feature +mve.fp \
|
||||||
|
// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \
|
||||||
|
// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
|
||||||
|
|
||||||
|
#include <arm_cde.h>
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx1q_u8(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
|
||||||
|
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
|
||||||
|
//
|
||||||
|
uint8x16_t test_vcx1q_u8(void) {
|
||||||
|
return __arm_vcx1q_u8(0, 1111);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx1qa_1(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112)
|
||||||
|
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
|
||||||
|
//
|
||||||
|
uint8x16_t test_vcx1qa_1(uint8x16_t acc) {
|
||||||
|
return __arm_vcx1qa(1, acc, 1112);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx1qa_2(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113)
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
|
||||||
|
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
|
||||||
|
//
|
||||||
|
int32x4_t test_vcx1qa_2(int32x4_t acc) {
|
||||||
|
return __arm_vcx1qa(0, acc, 1113);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx2q_u8(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111)
|
||||||
|
// CHECK-NEXT: ret <16 x i8> [[TMP1]]
|
||||||
|
//
|
||||||
|
uint8x16_t test_vcx2q_u8(float16x8_t n) {
|
||||||
|
return __arm_vcx2q_u8(1, n, 111);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx2q(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112)
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
|
||||||
|
// CHECK-NEXT: ret <4 x float> [[TMP2]]
|
||||||
|
//
|
||||||
|
float32x4_t test_vcx2q(float32x4_t n) {
|
||||||
|
return __arm_vcx2q(1, n, 112);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx2qa(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113)
|
||||||
|
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
|
||||||
|
// CHECK-NEXT: ret <4 x float> [[TMP3]]
|
||||||
|
//
|
||||||
|
float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) {
|
||||||
|
return __arm_vcx2qa(0, acc, n, 113);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx3q_u8(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11)
|
||||||
|
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
|
||||||
|
//
|
||||||
|
uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) {
|
||||||
|
return __arm_vcx3q_u8(0, n, m, 11);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx3q(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12)
|
||||||
|
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
|
||||||
|
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
|
||||||
|
//
|
||||||
|
uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) {
|
||||||
|
return __arm_vcx3q(1, n, m, 12);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: @test_vcx3qa(
|
||||||
|
// CHECK-NEXT: entry:
|
||||||
|
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
|
||||||
|
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13)
|
||||||
|
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
|
||||||
|
//
|
||||||
|
int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) {
|
||||||
|
return __arm_vcx3qa(1, acc, n, m, 13);
|
||||||
|
}
|
|
@ -103,3 +103,27 @@ void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) {
|
||||||
__arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
|
__arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
|
||||||
__arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
|
__arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) {
|
||||||
|
(void)__arm_vcx1q_u8(0, 0);
|
||||||
|
__arm_vcx1q_u8(0, a); // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}}
|
||||||
|
__arm_vcx1q_u8(0, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
|
||||||
|
__arm_vcx1qa(0, acc, a); // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}}
|
||||||
|
__arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
|
||||||
|
|
||||||
|
(void)__arm_vcx2q_u8(0, n, 0);
|
||||||
|
__arm_vcx2q_u8(0, n, a); // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}}
|
||||||
|
__arm_vcx2q_u8(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
|
||||||
|
__arm_vcx2q(0, n, a); // expected-error {{argument to '__arm_vcx2q' must be a constant integer}}
|
||||||
|
__arm_vcx2q(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
|
||||||
|
__arm_vcx2qa(0, n, acc, a); // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}}
|
||||||
|
__arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
|
||||||
|
|
||||||
|
(void)__arm_vcx3q_u8(0, n, m, 0);
|
||||||
|
__arm_vcx3q_u8(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}}
|
||||||
|
__arm_vcx3q_u8(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
|
||||||
|
__arm_vcx3q(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}}
|
||||||
|
__arm_vcx3q(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
|
||||||
|
__arm_vcx3qa(0, n, m, acc, a); // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}}
|
||||||
|
__arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
|
||||||
|
}
|
||||||
|
|
|
@ -1962,18 +1962,48 @@ void MveEmitter::EmitBuiltinSema(raw_ostream &OS) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
// Class that describes an ACLE intrinsic implemented as a macro.
|
||||||
|
//
|
||||||
|
// This class is used when the intrinsic is polymorphic in 2 or 3 types, but we
|
||||||
|
// want to avoid a combinatorial explosion by reinterpreting the arguments to
|
||||||
|
// fixed types.
|
||||||
|
|
||||||
|
class FunctionMacro {
|
||||||
|
std::vector<StringRef> Params;
|
||||||
|
StringRef Definition;
|
||||||
|
|
||||||
|
public:
|
||||||
|
FunctionMacro(const Record &R);
|
||||||
|
|
||||||
|
const std::vector<StringRef> &getParams() const { return Params; }
|
||||||
|
StringRef getDefinition() const { return Definition; }
|
||||||
|
};
|
||||||
|
|
||||||
|
FunctionMacro::FunctionMacro(const Record &R) {
|
||||||
|
Params = R.getValueAsListOfStrings("params");
|
||||||
|
Definition = R.getValueAsString("definition");
|
||||||
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
// The class used for generating arm_cde.h and related Clang bits
|
// The class used for generating arm_cde.h and related Clang bits
|
||||||
//
|
//
|
||||||
|
|
||||||
class CdeEmitter : public EmitterBase {
|
class CdeEmitter : public EmitterBase {
|
||||||
|
std::map<StringRef, FunctionMacro> FunctionMacros;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){};
|
CdeEmitter(RecordKeeper &Records);
|
||||||
void EmitHeader(raw_ostream &OS) override;
|
void EmitHeader(raw_ostream &OS) override;
|
||||||
void EmitBuiltinDef(raw_ostream &OS) override;
|
void EmitBuiltinDef(raw_ostream &OS) override;
|
||||||
void EmitBuiltinSema(raw_ostream &OS) override;
|
void EmitBuiltinSema(raw_ostream &OS) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) {
|
||||||
|
for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro"))
|
||||||
|
FunctionMacros.emplace(R->getName(), FunctionMacro(*R));
|
||||||
|
}
|
||||||
|
|
||||||
void CdeEmitter::EmitHeader(raw_ostream &OS) {
|
void CdeEmitter::EmitHeader(raw_ostream &OS) {
|
||||||
// Accumulate pieces of the header file that will be enabled under various
|
// Accumulate pieces of the header file that will be enabled under various
|
||||||
// different combinations of #ifdef. The index into parts[] is one of the
|
// different combinations of #ifdef. The index into parts[] is one of the
|
||||||
|
@ -2051,6 +2081,16 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (const auto &kv : FunctionMacros) {
|
||||||
|
StringRef Name = kv.first;
|
||||||
|
const FunctionMacro &FM = kv.second;
|
||||||
|
|
||||||
|
raw_ostream &OS = parts[MVE];
|
||||||
|
OS << "#define "
|
||||||
|
<< "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") "
|
||||||
|
<< FM.getDefinition() << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
for (auto &part : parts)
|
for (auto &part : parts)
|
||||||
part << "\n";
|
part << "\n";
|
||||||
|
|
||||||
|
|
|
@ -1317,4 +1317,20 @@ defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
|
||||||
defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
|
defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
|
||||||
defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
|
defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
|
||||||
|
|
||||||
|
multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
|
||||||
|
def "" : Intrinsic<
|
||||||
|
[llvm_v16i8_ty],
|
||||||
|
!listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
|
||||||
|
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
|
||||||
|
def a : Intrinsic<
|
||||||
|
[llvm_v16i8_ty],
|
||||||
|
!listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
|
||||||
|
args, [llvm_i32_ty /* imm */]),
|
||||||
|
[IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
|
||||||
|
}
|
||||||
|
|
||||||
|
defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;
|
||||||
|
defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>;
|
||||||
|
defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>;
|
||||||
|
|
||||||
} // end TargetPrefix
|
} // end TargetPrefix
|
||||||
|
|
|
@ -581,3 +581,28 @@ let Predicates = [HasCDE, HasFPRegs] in {
|
||||||
(f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
|
(f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
|
||||||
imm_3b:$imm))>;
|
imm_3b:$imm))>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let Predicates = [HasCDE, HasMVEInt] in {
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
|
||||||
|
timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
|
||||||
|
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
|
||||||
|
(v16i8 MQPR:$n), timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
|
||||||
|
imm_7b:$imm))>;
|
||||||
|
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
|
||||||
|
(v16i8 MQPR:$m), timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
|
||||||
|
imm_4b:$imm))>;
|
||||||
|
def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
|
||||||
|
(v16i8 MQPR:$n), (v16i8 MQPR:$m),
|
||||||
|
timm:$imm)),
|
||||||
|
(v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
|
||||||
|
imm_4b:$imm))>;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg)
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
|
||||||
|
declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg)
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() {
|
||||||
|
; CHECK-LABEL: test_vcx1q_u8:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx1 p0, q0, #1111
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
|
||||||
|
ret <16 x i8> %0
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) {
|
||||||
|
; CHECK-LABEL: test_vcx1qa_1:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx1a p1, q0, #1112
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112)
|
||||||
|
ret <16 x i8> %0
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) {
|
||||||
|
; CHECK-LABEL: test_vcx1qa_2:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx1a p0, q0, #1113
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <4 x i32> %acc to <16 x i8>
|
||||||
|
%1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113)
|
||||||
|
%2 = bitcast <16 x i8> %1 to <4 x i32>
|
||||||
|
ret <4 x i32> %2
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) {
|
||||||
|
; CHECK-LABEL: test_vcx2q_u8:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx2 p1, q0, q0, #111
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x half> %n to <16 x i8>
|
||||||
|
%1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111)
|
||||||
|
ret <16 x i8> %1
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) {
|
||||||
|
; CHECK-LABEL: test_vcx2q:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx2 p1, q0, q0, #112
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <4 x float> %n to <16 x i8>
|
||||||
|
%1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112)
|
||||||
|
%2 = bitcast <16 x i8> %1 to <4 x float>
|
||||||
|
ret <4 x float> %2
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) {
|
||||||
|
; CHECK-LABEL: test_vcx2qa:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx2a p0, q0, q1, #113
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <4 x float> %acc to <16 x i8>
|
||||||
|
%1 = bitcast <2 x i64> %n to <16 x i8>
|
||||||
|
%2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113)
|
||||||
|
%3 = bitcast <16 x i8> %2 to <4 x float>
|
||||||
|
ret <4 x float> %3
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) {
|
||||||
|
; CHECK-LABEL: test_vcx3q_u8:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx3 p0, q0, q0, q1, #11
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x i16> %n to <16 x i8>
|
||||||
|
%1 = bitcast <4 x i32> %m to <16 x i8>
|
||||||
|
%2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11)
|
||||||
|
ret <16 x i8> %2
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) {
|
||||||
|
; CHECK-LABEL: test_vcx3q:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx3 p1, q0, q0, q1, #12
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <2 x i64> %n to <16 x i8>
|
||||||
|
%1 = bitcast <4 x float> %m to <16 x i8>
|
||||||
|
%2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12)
|
||||||
|
%3 = bitcast <16 x i8> %2 to <2 x i64>
|
||||||
|
ret <2 x i64> %3
|
||||||
|
}
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) {
|
||||||
|
; CHECK-LABEL: test_vcx3qa:
|
||||||
|
; CHECK: @ %bb.0: @ %entry
|
||||||
|
; CHECK-NEXT: vcx3a p1, q0, q1, q2, #13
|
||||||
|
; CHECK-NEXT: bx lr
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x i16> %n to <16 x i8>
|
||||||
|
%1 = bitcast <4 x float> %m to <16 x i8>
|
||||||
|
%2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
|
||||||
|
ret <16 x i8> %2
|
||||||
|
}
|
Loading…
Reference in New Issue