From 1b45297e013e1edae5028d844ca7cb591c79b07d Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Mon, 7 Oct 2019 17:00:51 +0100
Subject: [PATCH] [ARM] Begin adding IR intrinsics for MVE instructions.

This commit, together with the next few, will add a representative
sample of the kind of IR intrinsics that we'll need in order to
implement the user-facing ACLE intrinsics for MVE. Supporting all of
them will take more work; the intention of this initial series of
commits is to implement an intrinsic or two from lots of different
categories, as examples and proofs of concept.

This initial commit introduces a small number of IR intrinsics for
instructions simple enough that they can use Tablegen ISel patterns:
the predicated versions of the VADD and VSUB instructions (both
integer and FP), VMINV and VMAXV, and the float->half VCVT
instruction (predicated and unpredicated).

When using VPT-predicated instructions in automatic code generation,
it will be convenient to specify the predicate value as a vector of
the appropriate number of i1. To make it easy to specify all sizes of
an instruction in one go and give each one the matching predicate
vector type, I've added a system of Tablegen informational records
describing MVE's vector types: each one gives the underlying LLVM IR
ValueType (which is not always unique: explicitly signed and unsigned
MVE vectors of the same width share one ValueType) and an appropriate
vNi1 to use as the predicate vector.

(Also, those info records include the usual encoding for the types,
so that as we add associations between each instruction encoding and
one of the new `MVEVectorVTInfo` records, we can remove some of the
existing template parameters and replace them with references to the
vector type info's fields.)

The user-facing ACLE intrinsics will receive a predicate mask as a
16-bit integer, so I've also provided a pair of intrinsics i2v and
v2i, to convert between an integer and a vector of i1 by just
changing the register class.
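
As a concrete illustration of the intended IR shape (this is exactly
what the vaddq.ll test added below exercises), a predicated vaddq_m
on a vector of 16 x i8, whose mask arrives as an i16, becomes a cast
of the mask followed by a call to the predicated intrinsic:

  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
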
Reviewers: dmgreen, miyuki, ostannard

Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67158
---
 llvm/include/llvm/IR/IntrinsicsARM.td         |  30 +++
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   4 +
 llvm/lib/Target/ARM/ARMInstrMVE.td            | 212 +++++++++++++-----
 .../CodeGen/Thumb2/mve-intrinsics/vaddq.ll    | 112 +++++++++
 .../CodeGen/Thumb2/mve-intrinsics/vcvt.ll     |  56 +++++
 .../CodeGen/Thumb2/mve-intrinsics/vminvq.ll   |  36 +++
 6 files changed, 391 insertions(+), 59 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index e13da6157e04..500ee8109f23 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -787,4 +787,34 @@ def int_arm_gnu_eabi_mcount : Intrinsic<[], [], [IntrReadMem, IntrWriteMem]>;
 
+def int_arm_mve_pred_i2v : Intrinsic<
+  [llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_pred_v2i : Intrinsic<
+  [llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
+                               list<IntrinsicProperty> props = [],
+                               string name = "",
+                               list<SDNodeProperty> sdprops = []> {
+  def _s: Intrinsic<rets, params, props, name, sdprops>;
+  def _u: Intrinsic<rets, params, props, name, sdprops>;
+}
+
+def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+def int_arm_mve_sub_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+
+defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
+   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
+   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+
+def int_arm_mve_vcvt_narrow: Intrinsic<[llvm_v8f16_ty],
+   [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vcvt_narrow_predicated: Intrinsic<[llvm_v8f16_ty],
+   [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem]>;
+
 } // end TargetPrefix

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index db26feb57010..615a09e16011 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3698,6 +3698,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   case Intrinsic::arm_neon_vtbl2:
     return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::arm_mve_pred_i2v:
+  case Intrinsic::arm_mve_pred_v2i:
+    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
   }
 }
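
Illustrative note (not part of the committed diff): because both
conversion intrinsics lower to the same ARMISD::PREDICATE_CAST node,
each direction is a pure reinterpretation of the 16 predicate bits
between the general-purpose and predicate register classes, so a
round trip such as the following (value names hypothetical) would be
expected to select to no more than a VMSR/VMRS pair:

  %v = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %m = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %v)
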
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 4f67cd6e47cc..7d49df3d0c07 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -275,6 +275,62 @@ class mve_addr_q_shift<int shift> : MemOperand {
   let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
 }
 
+// A family of classes wrapping up information about the vector types
+// used by MVE.
+class MVEVectorVTInfo<ValueType vec, ValueType pred, bits<2> size,
+                      string suffix, bit unsigned> {
+  // The LLVM ValueType representing the vector, so we can use it in
+  // ISel patterns.
+  ValueType Vec = vec;
+
+  // An LLVM ValueType representing a corresponding vector of
+  // predicate bits, for use in ISel patterns that handle an IR
+  // intrinsic describing the predicated form of the instruction.
+  //
+  // Usually, for a vector of N things, this will be vNi1. But for
+  // vectors of 2 values, we make an exception, and use v4i1 instead
+  // of v2i1. Rationale: MVE codegen doesn't support doing all the
+  // auxiliary operations on v2i1 (vector shuffles etc), and also,
+  // there's no MVE compare instruction that will _generate_ v2i1
+  // directly.
+  ValueType Pred = pred;
+
+  // The most common representation of the vector element size in MVE
+  // instruction encodings: a 2-bit value V representing an (8<<V)-bit
+  // vector element size.
+  bits<2> Size = size;
+
+  // For vectors explicitly mentioning a signedness of integers: 0 for
+  // signed and 1 for unsigned. For anything else, undefined.
+  bit Unsigned = unsigned;
+
+  // The suffix used on the instruction in assembly language.
+  string Suffix = suffix;
+}
+
+// Integer vector types that don't treat signed and unsigned differently.
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i8",  ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "i16", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "i32", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "i64", ?>;
+
+// Explicitly signed and unsigned integer vectors. They map to the
+// same set of LLVM ValueTypes as above, but are represented
+// differently in assembly and instruction encodings.
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s8",  0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "s16", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "s32", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "s64", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u8",  0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "u16", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "u32", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "u64", 0b1>;
+
+// FP vector types.
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1, 0b01, "f16", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1, 0b10, "f32", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1, 0b11, "f64", ?>;
+
 // --------- Start of base classes for the instructions themselves
 
 class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@@ ... @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
   let Inst{0} = 0b0;
 }
 
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_17, list<dag> pattern=[]> {
-  def s8  : MVE_VMINMAXV<iname, "s8",  0b0, 0b00, bit_17, pattern>;
-  def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, bit_17, pattern>;
-  def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, bit_17, pattern>;
-  def u8  : MVE_VMINMAXV<iname, "u8",  0b1, 0b00, bit_17, pattern>;
-  def u16 : MVE_VMINMAXV<iname, "u16", 0b1, 0b01, bit_17, pattern>;
-  def u32 : MVE_VMINMAXV<iname, "u32", 0b1, 0b10, bit_17, pattern>;
+multiclass MVE_VMINMAXV_p<string iname, bit bit_17, MVEVectorVTInfo VTI,
+                          Intrinsic intr> {
+  def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_17>;
+
+  let Predicates = [HasMVEInt] in
+  def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
+                 (i32 (!cast<Instruction>(NAME)
+                       (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
 }
 
-defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+multiclass MVE_VMINMAXV_ty<string iname, bit bit_17,
+                           Intrinsic intr_s, Intrinsic intr_u> {
+  defm s8 : MVE_VMINMAXV_p<iname, bit_17, MVE_v16s8, intr_s>;
+  defm s16: MVE_VMINMAXV_p<iname, bit_17, MVE_v8s16, intr_s>;
+  defm s32: MVE_VMINMAXV_p<iname, bit_17, MVE_v4s32, intr_s>;
+  defm u8 : MVE_VMINMAXV_p<iname, bit_17, MVE_v16u8, intr_u>;
+  defm u16: MVE_VMINMAXV_p<iname, bit_17, MVE_v8u16, intr_u>;
+  defm u32: MVE_VMINMAXV_p<iname, bit_17, MVE_v4u32, intr_u>;
+}
+
+defm MVE_VMINV : MVE_VMINMAXV_ty<
+  "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<
+  "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
 
 let Predicates = [HasMVEInt] in {
   def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@@ -1491,36 +1561,38 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
   let validForTailPredication = 1;
 }
 
-class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
-  : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>;
-class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]>
-  : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>;
+multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+                         SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>;
 
-def MVE_VADDi8  : MVE_VADD<"i8",  0b00>;
-def MVE_VADDi16 : MVE_VADD<"i16", 0b01>;
-def MVE_VADDi32 : MVE_VADD<"i32", 0b10>;
+  let Predicates = [HasMVEInt] in {
+    // Unpredicated add/subtract
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                        (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
-            (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
-  def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
-            (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+    // Predicated add/subtract
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                        (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                        (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                        (i32 1), (VTI.Pred VCCR:$mask),
+                        (VTI.Vec MQPR:$inactive)))>;
+  }
 }
 
-def MVE_VSUBi8  : MVE_VSUB<"i8",  0b00>;
-def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>;
-def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>;
+multiclass MVE_VADD<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
-            (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
-  def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
-            (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-}
+defm MVE_VADDi8  : MVE_VADD<MVE_v16i8>;
+defm MVE_VADDi16 : MVE_VADD<MVE_v8i16>;
+defm MVE_VADDi32 : MVE_VADD<MVE_v4i32>;
+
+defm MVE_VSUBi8  : MVE_VSUB<MVE_v16i8>;
+defm MVE_VSUBi16 : MVE_VSUB<MVE_v8i16>;
+defm MVE_VSUBi32 : MVE_VSUB<MVE_v4i32>;
 
 class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
                    bits<2> size, ValueType vt>
@@ -2763,31 +2835,35 @@ let Predicates = [HasMVEFloat] in {
             (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
 }
 
+multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+                            SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b0, 0b1,
+                             bit_21> {
+    let validForTailPredication = 1;
+  }
 
-let validForTailPredication = 1 in {
-  def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
-  def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                        (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                        (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                        (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                        (i32 1), (VTI.Pred VCCR:$mask),
+                        (VTI.Vec MQPR:$inactive)))>;
+  }
 }
 
-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
-            (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
-  def : Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
-            (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-}
+multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>;
+
+defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>;
+defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>;
 
-let validForTailPredication = 1 in {
-  def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
-  def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
-}
-
-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
-            (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
-  def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
-            (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-}
+defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>;
+defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>;
 
 class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
   : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
@@ -3551,13 +3627,31 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
 
-multiclass MVE_VCVT_ff_halves<string suffix, bit op> {
-  def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>;
-  def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>;
+multiclass MVE_VCVT_f2h_m<string iname, bit half> {
+  def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
+
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(v8f16 (int_arm_mve_vcvt_narrow
+                      (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+              (v8f16 (!cast<Instruction>(NAME)
+                      (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
+    def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated
+                      (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half),
+                      (v4i1 VCCR:$mask))),
+              (v8f16 (!cast<Instruction>(NAME)
+                      (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
+                      (i32 1), (v4i1 VCCR:$mask)))>;
+  }
 }
 
-defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
-defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
+multiclass MVE_VCVT_h2f_m<string iname, bit half> {
+  def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
+}
+
+defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
+defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;
+defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;
+defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;
 
 class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
                  string cstr="", list<dag> pattern=[]>
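
Illustrative note (not part of the committed diff): in the vcvt
patterns just above, the trailing i32 operand of
int_arm_mve_vcvt_narrow chooses the destination lanes, and hence the
instruction. An operand of 0 writes the bottom (even-numbered) f16
lanes, matching the vcvtb pattern, while 1 writes the top
(odd-numbered) lanes, matching vcvtt. For example (value names
hypothetical):

  %lo = call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %dst, <4 x float> %v, i32 0) ; selects MVE_VCVTf16f32bh
  %hi = call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %lo, <4 x float> %w, i32 1)  ; selects MVE_VCVTf16f32th
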
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
new file mode 100644
index 000000000000..a3cb91c21bc8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <4 x i32> %b, %a
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vaddq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fadd <4 x float> %b, %a
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: test_vsubq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fsub <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsubq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %a, %b
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i8 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+declare <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  ret <8 x half> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  ret <4 x float> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsubq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
new file mode 100644
index 000000000000..1e9cabf7cc0a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvttq_f16_f32(<8 x half> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vcvttq_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %a, <4 x float> %b, i32 1)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvtbq_f16_f32(<8 x half> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vcvtbq_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %a, <4 x float> %b, i32 0)
+  ret <8 x half> %0
+}
+
+declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvttq_m_f16_f32(<8 x half> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvttq_m_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvttt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half> %a, <4 x float> %b, i32 1, <4 x i1> %1)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvtbq_m_f16_f32(<8 x half> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvtbq_m_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvtbt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half> %a, <4 x float> %b, i32 0, <4 x i1> %1)
+  ret <8 x half> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
new file mode 100644
index 000000000000..a7c37802065f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc i32 @test_vminvq_u32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vminvq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_u8(i32 %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmaxvq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.u8 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.maxv.u.v16i8(i32 %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vminvq_s16(i32 %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vminvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.s16 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minv.s.v8i16(i32 %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>)
+declare i32 @llvm.arm.mve.maxv.u.v16i8(i32, <16 x i8>)
+declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>)
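
Illustrative note (not part of the committed diff): none of the tests
above exercises int_arm_mve_pred_v2i yet. Its intended role is the
reverse cast, turning a predicate vector back into the 16-bit mask
that the user-facing ACLE intrinsics traffic in. A sketch of the
expected IR shape, assuming the predicate is produced by an ordinary
vector comparison (value names hypothetical):

  %pred = icmp eq <4 x i32> %a, %b
  %cast = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %pred)
  %mask = trunc i32 %cast to i16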