[ARM] Begin adding IR intrinsics for MVE instructions.

This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMINV and VMAXV, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-07 17:00:51 +01:00 · 2019-10-07 17:00:51 +01:00 · 1b45297e01
parent b2a65f0d70
commit 1b45297e01
6 changed files with 391 additions and 59 deletions
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@ -787,4 +787,34 @@ def int_arm_gnu_eabi_mcount : Intrinsic<[],
                                    [],
                                    [IntrReadMem, IntrWriteMem]>;

+def int_arm_mve_pred_i2v : Intrinsic<
+  [llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_pred_v2i : Intrinsic<
+  [llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
+                                    list<IntrinsicProperty> props = [],
+                                    string name = "",
+                                    list<SDNodeProperty> sdprops = []> {
+  def _s: Intrinsic<rets, params, props, name, sdprops>;
+  def _u: Intrinsic<rets, params, props, name, sdprops>;
+}
+
+def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+def int_arm_mve_sub_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+
+defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
+   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
+   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+
+def int_arm_mve_vcvt_narrow: Intrinsic<[llvm_v8f16_ty],
+   [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vcvt_narrow_predicated: Intrinsic<[llvm_v8f16_ty],
+   [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem]>;
+
 } // end TargetPrefix
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@ -3698,6 +3698,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::arm_mve_pred_i2v:
+  case Intrinsic::arm_mve_pred_v2i:
+    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
  }
 }

--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@ -275,6 +275,62 @@ class mve_addr_q_shift<int shift> : MemOperand {
  let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
 }

+// A family of classes wrapping up information about the vector types
+// used by MVE.
+class MVEVectorVTInfo<ValueType vec, ValueType pred, bits<2> size,
+                      string suffix, bit unsigned> {
+  // The LLVM ValueType representing the vector, so we can use it in
+  // ISel patterns.
+  ValueType Vec = vec;
+
+  // An LLVM ValueType representing a corresponding vector of
+  // predicate bits, for use in ISel patterns that handle an IR
+  // intrinsic describing the predicated form of the instruction.
+  //
+  // Usually, for a vector of N things, this will be vNi1. But for
+  // vectors of 2 values, we make an exception, and use v4i1 instead
+  // of v2i1. Rationale: MVE codegen doesn't support doing all the
+  // auxiliary operations on v2i1 (vector shuffles etc), and also,
+  // there's no MVE compare instruction that will _generate_ v2i1
+  // directly.
+  ValueType Pred = pred;
+
+  // The most common representation of the vector element size in MVE
+  // instruction encodings: a 2-bit value V representing an (8<<V)-bit
+  // vector element.
+  bits<2> Size = size;
+
+  // For vectors explicitly mentioning a signedness of integers: 0 for
+  // signed and 1 for unsigned. For anything else, undefined.
+  bit Unsigned = unsigned;
+
+  // The suffix used on the instruction in assembly language.
+  string Suffix = suffix;
+}
+
+// Integer vector types that don't treat signed and unsigned differently.
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i8",  ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "i16", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "i32", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "i64", ?>;
+
+// Explicitly signed and unsigned integer vectors. They map to the
+// same set of LLVM ValueTypes as above, but are represented
+// differently in assembly and instruction encodings.
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s8",  0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "s16", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "s32", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "s64", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u8",  0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "u16", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "u32", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "u64", 0b1>;
+
+// FP vector types.
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1,  0b01, "f16", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1,  0b10, "f32", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1,  0b11, "f64", ?>;
+
 // --------- Start of base classes for the instructions themselves

 class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@ -658,17 +714,31 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
  let Inst{0} = 0b0;
 }

-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def s8  : MVE_VMINMAXV<iname, "s8",  0b0, 0b00, 0b1, bit_7>;
-  def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b1, bit_7>;
-  def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b1, bit_7>;
-  def u8  : MVE_VMINMAXV<iname, "u8",  0b1, 0b00, 0b1, bit_7>;
-  def u16 : MVE_VMINMAXV<iname, "u16", 0b1, 0b01, 0b1, bit_7>;
-  def u32 : MVE_VMINMAXV<iname, "u32", 0b1, 0b10, 0b1, bit_7>;
+multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
+                          MVEVectorVTInfo VTI, Intrinsic intr> {
+  def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
+                       bit_17, bit_7>;
+
+  let Predicates = [HasMVEInt] in
+  def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
+                 (i32 (!cast<Instruction>(NAME)
+                           (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
 }

-defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
+                           Intrinsic intr_s, Intrinsic intr_u> {
+  defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
+  defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
+  defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
+  defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
+  defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
+  defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
+}
+
+defm MVE_VMINV : MVE_VMINMAXV_ty<
+  "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<
+  "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;

 let Predicates = [HasMVEInt] in {
  def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@ -1491,36 +1561,38 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
  let validForTailPredication = 1;
 }

-class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
-  : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>;
-class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]>
-  : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>;
+multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+                         SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>;

-def MVE_VADDi8  : MVE_VADD<"i8",  0b00>;
-def MVE_VADDi16 : MVE_VADD<"i16", 0b01>;
-def MVE_VADDi32 : MVE_VADD<"i32", 0b10>;
+  let Predicates = [HasMVEInt] in {
+    // Unpredicated add/subtract
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;

-let Predicates = [HasMVEInt] in {
-  def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
-            (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
-  def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
-            (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+    // Predicated add/subtract
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 1), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive)))>;
+  }
 }

-def MVE_VSUBi8  : MVE_VSUB<"i8",  0b00>;
-def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>;
-def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>;
+multiclass MVE_VADD<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;

-let Predicates = [HasMVEInt] in {
-  def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
-            (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
-  def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
-            (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-}
+defm MVE_VADDi8  : MVE_VADD<MVE_v16i8>;
+defm MVE_VADDi16 : MVE_VADD<MVE_v8i16>;
+defm MVE_VADDi32 : MVE_VADD<MVE_v4i32>;
+
+defm MVE_VSUBi8  : MVE_VSUB<MVE_v16i8>;
+defm MVE_VSUBi16 : MVE_VSUB<MVE_v8i16>;
+defm MVE_VSUBi32 : MVE_VSUB<MVE_v4i32>;

 class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
                   bits<2> size, ValueType vt>
@ -2763,31 +2835,35 @@ let Predicates = [HasMVEFloat] in {
            (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
 }

+multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+                            SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
+    let validForTailPredication = 1;
+  }

-let validForTailPredication = 1 in {
-  def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
-  def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 1), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive)))>;
+  }
 }

-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
-            (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
-  def : Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
-            (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-}
+multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI>
+  : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>;

+defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>;
+defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>;

-let validForTailPredication = 1 in {
-  def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
-  def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
-}
-
-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
-            (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
-  def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
-            (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-}
+defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>;
+defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>;

 class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]>
  : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
@ -3551,13 +3627,31 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
  let Predicates = [HasMVEFloat];
 }

-multiclass MVE_VCVT_ff_halves<string suffix, bit op> {
-  def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>;
-  def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>;
+multiclass MVE_VCVT_f2h_m<string iname, int half> {
+  def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
+
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(v8f16 (int_arm_mve_vcvt_narrow
+                         (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+              (v8f16 (!cast<Instruction>(NAME)
+                         (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
+    def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated
+                         (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half),
+                         (v4i1 VCCR:$mask))),
+              (v8f16 (!cast<Instruction>(NAME)
+                         (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
+                         (i32 1), (v4i1 VCCR:$mask)))>;
+  }
 }

-defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
-defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
+multiclass MVE_VCVT_h2f_m<string iname, int half> {
+  def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
+}
+
+defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
+defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;
+defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;
+defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;

 class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
                 string cstr="", list<dag> pattern=[]>
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <4 x i32> %b, %a
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vaddq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fadd <4 x float> %b, %a
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: test_vsubq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fsub <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsubq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %a, %b
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i8 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+declare <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  ret <8 x half> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  ret <4 x float> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsubq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvttq_f16_f32(<8 x half> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vcvttq_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %a, <4 x float> %b, i32 1)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvtbq_f16_f32(<8 x half> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vcvtbq_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half> %a, <4 x float> %b, i32 0)
+  ret <8 x half> %0
+}
+
+declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvttq_m_f16_f32(<8 x half> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvttq_m_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvttt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half> %a, <4 x float> %b, i32 1, <4 x i1> %1)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vcvtbq_m_f16_f32(<8 x half> %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvtbq_m_f16_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvtbt.f16.f32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half> %a, <4 x float> %b, i32 0, <4 x i1> %1)
+  ret <8 x half> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc i32 @test_vminvq_u32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vminvq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_u8(i32 %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmaxvq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.u8 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.maxv.u.v16i8(i32 %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vminvq_s16(i32 %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vminvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.s16 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minv.s.v8i16(i32 %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>)
+declare i32 @llvm.arm.mve.maxv.u.v16i8(i32, <16 x i8>)
+declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>)