From d22e66171251cd3dd07507912189aa814a419678 Mon Sep 17 00:00:00 2001
From: Mikhail Maltsev
Date: Fri, 20 Mar 2020 14:01:53 +0000
Subject: [PATCH] [ARM,CDE] Implement CDE S and D-register intrinsics

Summary:
This patch implements the following ACLE intrinsics:

  uint32_t __arm_vcx1_u32(int coproc, uint32_t imm);
  uint32_t __arm_vcx1a_u32(int coproc, uint32_t acc, uint32_t imm);
  uint32_t __arm_vcx2_u32(int coproc, uint32_t n, uint32_t imm);
  uint32_t __arm_vcx2a_u32(int coproc, uint32_t acc, uint32_t n, uint32_t imm);
  uint32_t __arm_vcx3_u32(int coproc, uint32_t n, uint32_t m, uint32_t imm);
  uint32_t __arm_vcx3a_u32(int coproc, uint32_t acc, uint32_t n, uint32_t m, uint32_t imm);

  uint64_t __arm_vcx1d_u64(int coproc, uint32_t imm);
  uint64_t __arm_vcx1da_u64(int coproc, uint64_t acc, uint32_t imm);
  uint64_t __arm_vcx2d_u64(int coproc, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx2da_u64(int coproc, uint64_t acc, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx3d_u64(int coproc, uint64_t n, uint64_t m, uint32_t imm);
  uint64_t __arm_vcx3da_u64(int coproc, uint64_t acc, uint64_t n, uint64_t m, uint32_t imm);

Since the semantics of CDE instructions are opaque to the compiler, the
ACLE intrinsics require dedicated LLVM IR intrinsics. The 64-bit and
32-bit variants share the same IR intrinsic.

Reviewers: simon_tatham, MarkMurrayARM, ostannard, dmgreen Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76298 --- clang/include/clang/Basic/arm_cde.td | 37 +++++ clang/test/CodeGen/arm-cde-vfp.c | 145 +++++++++++++++++++ clang/test/Sema/arm-cde-immediates.c | 40 ++++++ clang/utils/TableGen/MveEmitter.cpp | 3 + llvm/include/llvm/IR/IntrinsicsARM.td | 16 +++ llvm/lib/Target/ARM/ARMInstrCDE.td | 39 +++++ llvm/test/CodeGen/Thumb2/cde-vfp.ll | 198 ++++++++++++++++++++++++++ 7 files changed, 478 insertions(+) create mode 100644 clang/test/CodeGen/arm-cde-vfp.c create mode 100644 llvm/test/CodeGen/Thumb2/cde-vfp.ll diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td index 139007d387a0..9cd0af8987c9 100644 --- a/clang/include/clang/Basic/arm_cde.td +++ b/clang/include/clang/Basic/arm_cde.td @@ -13,6 +13,15 @@ include "arm_mve_defs.td" +// f64 is not defined in arm_mve_defs.td because MVE instructions only work with +// f16 and f32 +def f64: PrimitiveType<"f", 64>; + +// Float expects t to be a scalar type, and expands to the floating-point +// type of the same width. +class Float: ComplexType<(CTO_CopyKind t, f32)>; +def FScalar: Float; + // ACLE CDE intrinsic class CDEIntrinsic : Intrinsic { @@ -70,3 +79,31 @@ multiclass CDE_CX_m { defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>; defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>; defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? $n, $m)>; + +// VCX* instructions operating on VFP registers +multiclass CDE_VCXFP_m { + defvar cp = (args imm_coproc:$cp); + let pnt = PNT_None, params = [u32] in { + def "" : CDEIntrinsic $cp), cgArgs, (? $imm)), + Scalar)>; + def a : CDEIntrinsic $cp, + (bitcast $acc, FScalar)), cgArgs, (? $imm)), Scalar)>; + } + let pnt = PNT_None, params = [u64] in { + def d : CDEIntrinsic $cp), cgArgs, (? 
$imm)), + Scalar)>; + def da : CDEIntrinsic $cp, + (bitcast $acc, FScalar)), cgArgs, (? $imm)), Scalar)>; + } +} + +defm vcx1: CDE_VCXFP_m<(args imm_11b:$imm), (args), (args), (?)>; +defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n), + (? (bitcast $n, FScalar))>; +defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm), + (args u32:$n, u32:$m), (args u64:$n, u64:$m), + (? (bitcast $n, FScalar), (bitcast $m, FScalar))>; diff --git a/clang/test/CodeGen/arm-cde-vfp.c b/clang/test/CodeGen/arm-cde-vfp.c new file mode 100644 index 000000000000..fffcb716359d --- /dev/null +++ b/clang/test/CodeGen/arm-cde-vfp.c @@ -0,0 +1,145 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \ +// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \ +// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \ +// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vcx1_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vcx1_u32(void) { + return __arm_vcx1_u32(0, 11); +} + +// CHECK-LABEL: @test_vcx1a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.arm.cde.vcx1a.f32(i32 1, float [[TMP0]], i32 12) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vcx1a_u32(uint32_t acc) { + return __arm_vcx1a_u32(1, acc, 12); +} + +// CHECK-LABEL: @test_vcx2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.arm.cde.vcx2.f32(i32 0, float [[TMP0]], i32 21) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t 
test_vcx2_u32(uint32_t n) { + return __arm_vcx2_u32(0, n, 21); +} + +// CHECK-LABEL: @test_vcx2a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.cde.vcx2a.f32(i32 0, float [[TMP0]], float [[TMP1]], i32 22) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vcx2a_u32(uint32_t acc, uint32_t n) { + return __arm_vcx2a_u32(0, acc, n, 22); +} + +// CHECK-LABEL: @test_vcx3_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[M:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.cde.vcx3.f32(i32 1, float [[TMP0]], float [[TMP1]], i32 3) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vcx3_u32(uint32_t n, uint32_t m) { + return __arm_vcx3_u32(1, n, m, 3); +} + +// CHECK-LABEL: @test_vcx3a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[M:%.*]] to float +// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.arm.cde.vcx3a.f32(i32 0, float [[TMP0]], float [[TMP1]], float [[TMP2]], i32 5) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32 +// CHECK-NEXT: ret i32 [[TMP4]] +// +uint32_t test_vcx3a_u32(uint32_t acc, uint32_t n, uint32_t m) { + return __arm_vcx3a_u32(0, acc, n, m, 5); +} + +// CHECK-LABEL: @test_vcx1d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[TMP1]] +// +uint64_t test_vcx1d_u64(void) { + return __arm_vcx1d_u64(0, 11); +} + +// CHECK-LABEL: @test_vcx1da_u64( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.arm.cde.vcx1a.f64(i32 1, double [[TMP0]], i32 12) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[TMP2]] +// +uint64_t test_vcx1da_u64(uint64_t acc) { + return __arm_vcx1da_u64(1, acc, 12); +} + +// CHECK-LABEL: @test_vcx2d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.arm.cde.vcx2.f64(i32 0, double [[TMP0]], i32 21) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[TMP2]] +// +uint64_t test_vcx2d_u64(uint64_t n) { + return __arm_vcx2d_u64(0, n, 21); +} + +// CHECK-LABEL: @test_vcx2da_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.arm.cde.vcx2a.f64(i32 0, double [[TMP0]], double [[TMP1]], i32 22) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vcx2da_u64(uint64_t acc, uint64_t n) { + return __arm_vcx2da_u64(0, acc, n, 22); +} + +// CHECK-LABEL: @test_vcx3d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[M:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.arm.cde.vcx3.f64(i32 1, double [[TMP0]], double [[TMP1]], i32 3) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vcx3d_u64(uint64_t n, uint64_t m) { + return __arm_vcx3d_u64(1, n, m, 3); +} + +// CHECK-LABEL: @test_vcx3da_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[M:%.*]] to double +// CHECK-NEXT: 
[[TMP3:%.*]] = call double @llvm.arm.cde.vcx3a.f64(i32 0, double [[TMP0]], double [[TMP1]], double [[TMP2]], i32 5) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64 +// CHECK-NEXT: ret i64 [[TMP4]] +// +uint64_t test_vcx3da_u64(uint64_t acc, uint64_t n, uint64_t m) { + return __arm_vcx3da_u64(0, acc, n, m, 5); +} diff --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c index d521e099c7d1..19159f9be4ea 100644 --- a/clang/test/Sema/arm-cde-immediates.c +++ b/clang/test/Sema/arm-cde-immediates.c @@ -63,3 +63,43 @@ void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) { __arm_cx3da(0, da, n, m, a); // expected-error {{argument to '__arm_cx3da' must be a constant integer}} __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} } + +void test_vcxfp_u32(uint32_t a, uint32_t n, uint32_t m) { + (void)__arm_vcx1_u32(0, 0); + __arm_vcx1_u32(0, a); // expected-error {{argument to '__arm_vcx1_u32' must be a constant integer}} + __arm_vcx1_u32(0, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + __arm_vcx1a_u32(0, a, a); // expected-error {{argument to '__arm_vcx1a_u32' must be a constant integer}} + __arm_vcx1a_u32(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + + (void)__arm_vcx2_u32(0, n, 0); + __arm_vcx2_u32(0, n, a); // expected-error {{argument to '__arm_vcx2_u32' must be a constant integer}} + __arm_vcx2_u32(0, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_vcx2a_u32(0, a, n, a); // expected-error {{argument to '__arm_vcx2a_u32' must be a constant integer}} + __arm_vcx2a_u32(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + + (void)__arm_vcx3_u32(0, n, m, 0); + __arm_vcx3_u32(0, n, m, a); // expected-error {{argument to '__arm_vcx3_u32' must be a constant integer}} + __arm_vcx3_u32(0, n, m, 8); // 
expected-error {{argument value 8 is outside the valid range [0, 7]}} + __arm_vcx3a_u32(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3a_u32' must be a constant integer}} + __arm_vcx3a_u32(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) { + (void)__arm_vcx1d_u64(0, 0); + __arm_vcx1d_u64(0, a); // expected-error {{argument to '__arm_vcx1d_u64' must be a constant integer}} + __arm_vcx1d_u64(0, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + __arm_vcx1da_u64(0, a, a); // expected-error {{argument to '__arm_vcx1da_u64' must be a constant integer}} + __arm_vcx1da_u64(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + + (void)__arm_vcx2d_u64(0, n, 0); + __arm_vcx2d_u64(0, n, a); // expected-error {{argument to '__arm_vcx2d_u64' must be a constant integer}} + __arm_vcx2d_u64(0, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_vcx2da_u64(0, a, n, a); // expected-error {{argument to '__arm_vcx2da_u64' must be a constant integer}} + __arm_vcx2da_u64(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + + (void)__arm_vcx3d_u64(0, n, m, 0); + __arm_vcx3d_u64(0, n, m, a); // expected-error {{argument to '__arm_vcx3d_u64' must be a constant integer}} + __arm_vcx3d_u64(0, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}} + __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index f75f5000f0f6..076b491ff94f 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1995,6 +1995,9 @@ void 
CdeEmitter::EmitHeader(raw_ostream &OS) { const ScalarType *ST = kv.second.get(); if (ST->hasNonstandardName()) continue; + // We don't have float64x2_t + if (ST->kind() == ScalarTypeKind::Float && ST->sizeInBits() == 64) + continue; raw_ostream &OS = parts[ST->requiresFloat() ? MVEFloat : MVE]; const VectorType *VT = getVectorType(ST); diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index ba0cf909e5de..de2e6a39abeb 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1301,4 +1301,20 @@ defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>; defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>; defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>; +multiclass CDEVCXIntrinsics args> { + def "" : Intrinsic< + [llvm_anyfloat_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_anyfloat_ty], + !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* acc */], + args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>; +defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>; +defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>; + } // end TargetPrefix diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td index 648911acd26c..9497e1733689 100644 --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -542,3 +542,42 @@ def CDE_VCX3_fpdp : CDE_VCX3_FP_Instr_D<"vcx3", cde_vcx_params_d_noacc>; def CDE_VCX3A_fpdp : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>; def CDE_VCX3_vec : CDE_VCX3_Vec_Instr<"vcx3", cde_vcx_params_q_noacc>; def CDE_VCX3A_vec : CDE_VCX3_Vec_Instr<"vcx3a", cde_vcx_params_q_acc>; + + +let Predicates = [HasCDE, HasFPRegs] in { + def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f32 (CDE_VCX1_fpsp 
p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)), + (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)), + (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)), + (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + timm:$imm)), + (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)), + (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + timm:$imm)), + (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + timm:$imm)), + (f32 (CDE_VCX3_fpsp p_imm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + imm_3b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + (f32 SPR:$m), timm:$imm)), + (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m, + imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m), + timm:$imm)), + (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + (f64 DPR:$m), timm:$imm)), + (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m, + imm_3b:$imm))>; +} diff --git a/llvm/test/CodeGen/Thumb2/cde-vfp.ll b/llvm/test/CodeGen/Thumb2/cde-vfp.ll new file mode 100644 index 000000000000..54ee1d516661 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/cde-vfp.ll @@ -0,0 +1,198 @@ +; RUN: llc 
-mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbv8m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+fp-armv8d16sp -verify-machineinstrs -o - %s | FileCheck %s + +declare float @llvm.arm.cde.vcx1.f32(i32 immarg, i32 immarg) +declare float @llvm.arm.cde.vcx1a.f32(i32 immarg, float, i32 immarg) +declare float @llvm.arm.cde.vcx2.f32(i32 immarg, float, i32 immarg) +declare float @llvm.arm.cde.vcx2a.f32(i32 immarg, float, float, i32 immarg) +declare float @llvm.arm.cde.vcx3.f32(i32 immarg, float, float, i32 immarg) +declare float @llvm.arm.cde.vcx3a.f32(i32 immarg, float, float, float, i32 immarg) + +declare double @llvm.arm.cde.vcx1.f64(i32 immarg, i32 immarg) +declare double @llvm.arm.cde.vcx1a.f64(i32 immarg, double, i32 immarg) +declare double @llvm.arm.cde.vcx2.f64(i32 immarg, double, i32 immarg) +declare double @llvm.arm.cde.vcx2a.f64(i32 immarg, double, double, i32 immarg) +declare double @llvm.arm.cde.vcx3.f64(i32 immarg, double, double, i32 immarg) +declare double @llvm.arm.cde.vcx3a.f64(i32 immarg, double, double, double, i32 immarg) + +define arm_aapcs_vfpcc i32 @test_vcx1_u32() { +; CHECK-LABEL: test_vcx1_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, s0, #11 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11) + %1 = bitcast float %0 to i32 + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vcx1a_u32(i32 %acc) { +; CHECK-LABEL: test_vcx1a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcx1a p1, s0, #12 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = call float @llvm.arm.cde.vcx1a.f32(i32 1, float %0, i32 12) + %2 = bitcast float %1 to i32 + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vcx2_u32(i32 %n) { +; CHECK-LABEL: test_vcx2_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcx2 p0, s0, s0, #21 +; 
CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %n to float + %1 = call float @llvm.arm.cde.vcx2.f32(i32 0, float %0, i32 21) + %2 = bitcast float %1 to i32 + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vcx2a_u32(i32 %acc, i32 %n) { +; CHECK-LABEL: test_vcx2a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcx2a p0, s2, s0, #22 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = bitcast i32 %n to float + %2 = call float @llvm.arm.cde.vcx2a.f32(i32 0, float %0, float %1, i32 22) + %3 = bitcast float %2 to i32 + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vcx3_u32(i32 %n, i32 %m) { +; CHECK-LABEL: test_vcx3_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcx3 p1, s0, s2, s0, #3 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %n to float + %1 = bitcast i32 %m to float + %2 = call float @llvm.arm.cde.vcx3.f32(i32 1, float %0, float %1, i32 3) + %3 = bitcast float %2 to i32 + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vcx3a_u32(i32 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_vcx3a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: vmov s4, r0 +; CHECK-NEXT: vcx3a p0, s4, s2, s0, #5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = bitcast i32 %n to float + %2 = bitcast i32 %m to float + %3 = call float @llvm.arm.cde.vcx3a.f32(i32 0, float %0, float %1, float %2, i32 5) + %4 = bitcast float %3 to i32 + ret i32 %4 +} + +define arm_aapcs_vfpcc i64 @test_vcx1d_u64() { +; CHECK-LABEL: test_vcx1d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, d0, #11 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11) + %1 = bitcast double %0 to i64 + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 
@test_vcx1da_u64(i64 %acc) { +; CHECK-LABEL: test_vcx1da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcx1a p1, d0, #12 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %acc to double + %1 = call double @llvm.arm.cde.vcx1a.f64(i32 1, double %0, i32 12) + %2 = bitcast double %1 to i64 + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vcx2d_u64(i64 %n) { +; CHECK-LABEL: test_vcx2d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcx2 p0, d0, d0, #21 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %n to double + %1 = call double @llvm.arm.cde.vcx2.f64(i32 0, double %0, i32 21) + %2 = bitcast double %1 to i64 + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vcx2da_u64(i64 %acc, i64 %n) { +; CHECK-LABEL: test_vcx2da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vcx2a p0, d1, d0, #22 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %acc to double + %1 = bitcast i64 %n to double + %2 = call double @llvm.arm.cde.vcx2a.f64(i32 0, double %0, double %1, i32 22) + %3 = bitcast double %2 to i64 + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vcx3d_u64(i64 %n, i64 %m) { +; CHECK-LABEL: test_vcx3d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vcx3 p1, d0, d1, d0, #3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %n to double + %1 = bitcast i64 %m to double + %2 = call double @llvm.arm.cde.vcx3.f64(i32 1, double %0, double %1, i32 3) + %3 = bitcast double %2 to i64 + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vcx3da_u64(i64 %acc, i64 %n, i64 %m) { +; CHECK-LABEL: test_vcx3da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #8] +; CHECK-DAG: vmov [[D0:d.*]], r0, r1 +; CHECK-DAG: vmov [[D1:d.*]], r2, r3 +; 
CHECK-DAG: vmov [[D2:d.*]], lr, r12 +; CHECK-NEXT: vcx3a p0, [[D0]], [[D1]], [[D2]], #5 +; CHECK-NEXT: vmov r0, r1, [[D0]] +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i64 %acc to double + %1 = bitcast i64 %n to double + %2 = bitcast i64 %m to double + %3 = call double @llvm.arm.cde.vcx3a.f64(i32 0, double %0, double %1, double %2, i32 5) + %4 = bitcast double %3 to i64 + ret i64 %4 +}