[ARM,MVE] Add ACLE intrinsics for the vminv/vmaxv family.

Summary:
I've implemented these as target-specific IR intrinsics, because
they're not //quite// similar enough to
@llvm.experimental.vector.reduce.min to reuse it (that intrinsic
doesn't take the extra scalar parameter). Doing it this way also
keeps the predicated and unpredicated versions looking similar, and
lets the floating-point minnm/maxnm versions fold into the same
schema.
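
To make that concrete, here's the shape of the pair at the IR level,
as it appears in the updated tests below (operand names are
illustrative):

```
; unpredicated: the scalar accumulator rides along as a leading operand
%r = call float @llvm.arm.mve.minnmv.f32.v4f32(float %acc, <4 x float> %vec)

; predicated: identical, with a lane mask appended
%p = call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(
         float %acc, <4 x float> %vec, <4 x i1> %mask)
```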

We had a couple of min/max reductions already implemented, from the
initial pathfinding exercise in D67158. Those were done by having
separate IR intrinsic names for the signed and unsigned integer
versions; as part of this commit, I've changed them to use a flag
parameter indicating signedness, which is how we ended up deciding
that the rest of the MVE intrinsics family ought to work. So now
hopefully the whole lot is consistent.
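
In IR terms, the change to the existing reductions looks like this
(both forms are visible in the updated tests below; the flag is 0 for
signed and 1 for unsigned):

```
; before: signedness encoded in the intrinsic name
%s = call i32 @llvm.arm.mve.minv.s.v16i8(i32 %acc, <16 x i8> %vec)
%u = call i32 @llvm.arm.mve.minv.u.v16i8(i32 %acc, <16 x i8> %vec)

; after: one intrinsic, with a trailing i32 signedness flag
%s = call i32 @llvm.arm.mve.minv.v16i8(i32 %acc, <16 x i8> %vec, i32 0)
%u = call i32 @llvm.arm.mve.minv.v16i8(i32 %acc, <16 x i8> %vec, i32 1)
```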

In the new llc test, the output code from the `v8f16` test functions
looks quite unpleasant, but most of it is procedure call standard
(PCS) lowering: you can't pass a `half` directly in or out of a
function. In other circumstances,
where you do something else with your `half` in the same function, it
doesn't look nearly as nasty.
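
(Specifically, the unpleasantness is roughly the coercion dance
visible in the f16 tests below: the `half` argument arrives packed in
the low bits of a `float` and has to be unpacked, and the `half`
result is packed back the same way on return.)

```
%0 = bitcast float %a.coerce to i32    ; reinterpret the incoming float
%1 = trunc i32 %0 to i16               ; keep its low 16 bits...
%2 = bitcast i16 %1 to half            ; ...which hold the real half value
```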

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D76490
Simon Tatham 2020-03-20 15:40:03 +00:00
parent eddede9d51
commit 45a9945b9e
6 changed files with 1762 additions and 102 deletions


@@ -536,11 +536,42 @@ let params = T.Float in {
       (IRInt<"vmaxnma_predicated", [Vector,Predicate]> $a, $b, $pred)>;
 }
 
+multiclass Reduction<Type Accumulator, string basename, list<Type> basetypes,
+                     bit needSign = 0,
+                     dag postCG = (seq (id $ret)),
+                     dag accArg = (args Accumulator:$prev),
+                     dag preCG = (seq)> {
+  defvar intArgsBase = (? $prev, $vec);
+  defvar intArgsUnpred = !con(intArgsBase,
+                              !if(needSign, (? (unsignedflag Scalar)), (?)));
+  defvar intArgsPred = !con(intArgsUnpred, (? $pred));
+  defvar intUnpred = !setop(intArgsUnpred, IRInt<basename, basetypes>);
+  defvar intPred = !setop(intArgsPred, IRInt<
+      basename#"_predicated", !listconcat(basetypes, [Predicate])>);
+
+  def "": Intrinsic<
+      Accumulator, !con(accArg, (args Vector:$vec)),
+      !con(preCG, (seq intUnpred:$ret), postCG)>;
+  def _p: Intrinsic<
+      Accumulator, !con(accArg, (args Vector:$vec, Predicate:$pred)),
+      !con(preCG, (seq intPred:$ret), postCG)>;
+}
+
 let params = T.Int in {
-def vminvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
-    (Scalar (IRInt<"minv", [Vector], 1> $prev, $vec))>;
-def vmaxvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
-    (Scalar (IRInt<"maxv", [Vector], 1> $prev, $vec))>;
+defm vminvq: Reduction<Scalar, "minv", [Vector], 1, (seq (Scalar $ret))>;
+defm vmaxvq: Reduction<Scalar, "maxv", [Vector], 1, (seq (Scalar $ret))>;
 }
+
+let params = T.Signed in {
+defm vminavq: Reduction<UScalar, "minav", [Vector], 0, (seq (UScalar $ret))>;
+defm vmaxavq: Reduction<UScalar, "maxav", [Vector], 0, (seq (UScalar $ret))>;
+}
+
+let params = T.Float in {
+defm vminnmvq: Reduction<Scalar, "minnmv", [Scalar, Vector]>;
+defm vmaxnmvq: Reduction<Scalar, "maxnmv", [Scalar, Vector]>;
+defm vminnmavq: Reduction<Scalar, "minnmav", [Scalar, Vector]>;
+defm vmaxnmavq: Reduction<Scalar, "maxnmav", [Scalar, Vector]>;
+}
+
 foreach half = [ "b", "t" ] in {


@@ -1,97 +1,853 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
 
#include <arm_mve.h>
 // CHECK-LABEL: @test_vminvq_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
 // CHECK-NEXT:    ret i8 [[TMP2]]
 //
-int8_t test_vminvq_s8(int8_t a, int8x16_t b)
-{
+int8_t test_vminvq_s8(int8_t a, int8x16_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s8(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_s8(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
 //
-int16_t test_vminvq_s16(int16_t a, int16x8_t b)
-{
+int16_t test_vminvq_s16(int16_t a, int16x8_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s16(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_s16(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.s.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
-int32_t test_vminvq_s32(int32_t a, int32x4_t b)
-{
+int32_t test_vminvq_s32(int32_t a, int32x4_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s32(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_s32(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
 // CHECK-NEXT:    ret i8 [[TMP2]]
 //
-uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b)
-{
+uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u8(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_u8(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
 //
-uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b)
-{
+uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u16(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_u16(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.u.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
-uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b)
-{
+uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u32(a, b);
+  return vminvq(a, b);
+#else /* POLYMORPHIC */
+  return vminvq_u32(a, b);
 #endif /* POLYMORPHIC */
 }
// CHECK-LABEL: @test_vmaxvq_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0)
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
// CHECK-NEXT: ret i8 [[TMP2]]
//
int8_t test_vmaxvq_s8(int8_t a, int8x16_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_s8(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
// CHECK-NEXT: ret i16 [[TMP2]]
//
int16_t test_vmaxvq_s16(int16_t a, int16x8_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_s16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
// CHECK-NEXT: ret i32 [[TMP0]]
//
int32_t test_vmaxvq_s32(int32_t a, int32x4_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_s32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
// CHECK-NEXT: ret i8 [[TMP2]]
//
uint8_t test_vmaxvq_u8(uint8_t a, uint8x16_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_u8(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1)
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
// CHECK-NEXT: ret i16 [[TMP2]]
//
uint16_t test_vmaxvq_u16(uint16_t a, uint16x8_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_u16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret i32 [[TMP0]]
//
uint32_t test_vmaxvq_u32(uint32_t a, uint32x4_t b) {
#ifdef POLYMORPHIC
return vmaxvq(a, b);
#else /* POLYMORPHIC */
return vmaxvq_u32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
// CHECK-NEXT: ret i8 [[TMP2]]
//
uint8_t test_vminavq_s8(uint8_t a, int8x16_t b) {
#ifdef POLYMORPHIC
return vminavq(a, b);
#else /* POLYMORPHIC */
return vminavq_s8(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
// CHECK-NEXT: ret i16 [[TMP2]]
//
uint16_t test_vminavq_s16(uint16_t a, int16x8_t b) {
#ifdef POLYMORPHIC
return vminavq(a, b);
#else /* POLYMORPHIC */
return vminavq_s16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret i32 [[TMP0]]
//
uint32_t test_vminavq_s32(uint32_t a, int32x4_t b) {
#ifdef POLYMORPHIC
return vminavq(a, b);
#else /* POLYMORPHIC */
return vminavq_s32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
// CHECK-NEXT: ret i8 [[TMP2]]
//
uint8_t test_vmaxavq_s8(uint8_t a, int8x16_t b) {
#ifdef POLYMORPHIC
return vmaxavq(a, b);
#else /* POLYMORPHIC */
return vmaxavq_s8(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
// CHECK-NEXT: ret i16 [[TMP2]]
//
uint16_t test_vmaxavq_s16(uint16_t a, int16x8_t b) {
#ifdef POLYMORPHIC
return vmaxavq(a, b);
#else /* POLYMORPHIC */
return vmaxavq_s16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret i32 [[TMP0]]
//
uint32_t test_vmaxavq_s32(uint32_t a, int32x4_t b) {
#ifdef POLYMORPHIC
return vmaxavq(a, b);
#else /* POLYMORPHIC */
return vmaxavq_s32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmvq_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP5]]
//
float16_t test_vminnmvq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
return vminnmvq(a, b);
#else /* POLYMORPHIC */
return vminnmvq_f16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmvq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.minnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vminnmvq_f32(float32_t a, float32x4_t b) {
#ifdef POLYMORPHIC
return vminnmvq(a, b);
#else /* POLYMORPHIC */
return vminnmvq_f32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmavq_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP5]]
//
float16_t test_vminnmavq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
return vminnmavq(a, b);
#else /* POLYMORPHIC */
return vminnmavq_f16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmavq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.minnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vminnmavq_f32(float32_t a, float32x4_t b) {
#ifdef POLYMORPHIC
return vminnmavq(a, b);
#else /* POLYMORPHIC */
return vminnmavq_f32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmvq_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP5]]
//
float16_t test_vmaxnmvq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
return vmaxnmvq(a, b);
#else /* POLYMORPHIC */
return vmaxnmvq_f16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmvq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vmaxnmvq_f32(float32_t a, float32x4_t b) {
#ifdef POLYMORPHIC
return vmaxnmvq(a, b);
#else /* POLYMORPHIC */
return vmaxnmvq_f32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmavq_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP5]]
//
float16_t test_vmaxnmavq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
return vmaxnmavq(a, b);
#else /* POLYMORPHIC */
return vmaxnmavq_f16(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmavq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vmaxnmavq_f32(float32_t a, float32x4_t b) {
#ifdef POLYMORPHIC
return vmaxnmavq(a, b);
#else /* POLYMORPHIC */
return vmaxnmavq_f32(a, b);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
int8_t test_vminvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_s8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
int16_t test_vminvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_s16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
int32_t test_vminvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_s32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
uint8_t test_vminvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_u8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_vminvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_u16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminvq_p_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
uint32_t test_vminvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminvq_p_u32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
int8_t test_vmaxvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_s8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
int16_t test_vmaxvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_s16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
int32_t test_vmaxvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_s32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
uint8_t test_vmaxvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_u8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_vmaxvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_u16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxvq_p_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
uint32_t test_vmaxvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxvq_p_u32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_p_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
uint8_t test_vminavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminavq_p(a, b, p);
#else /* POLYMORPHIC */
return vminavq_p_s8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_p_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_vminavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminavq_p(a, b, p);
#else /* POLYMORPHIC */
return vminavq_p_s16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminavq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
uint32_t test_vminavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminavq_p(a, b, p);
#else /* POLYMORPHIC */
return vminavq_p_s32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_p_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
// CHECK-NEXT: ret i8 [[TMP4]]
//
uint8_t test_vmaxavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxavq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxavq_p_s8(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_p_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
// CHECK-NEXT: ret i16 [[TMP4]]
//
uint16_t test_vmaxavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxavq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxavq_p_s16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxavq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret i32 [[TMP2]]
//
uint32_t test_vmaxavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxavq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxavq_p_s32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmvq_p_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP7]]
//
float16_t test_vminnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminnmvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminnmvq_p_f16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmvq_p_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret float [[TMP2]]
//
float32_t test_vminnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminnmvq_p(a, b, p);
#else /* POLYMORPHIC */
return vminnmvq_p_f32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmavq_p_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP7]]
//
float16_t test_vminnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminnmavq_p(a, b, p);
#else /* POLYMORPHIC */
return vminnmavq_p_f16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vminnmavq_p_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret float [[TMP2]]
//
float32_t test_vminnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vminnmavq_p(a, b, p);
#else /* POLYMORPHIC */
return vminnmavq_p_f32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmvq_p_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP7]]
//
float16_t test_vmaxnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxnmvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxnmvq_p_f16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmvq_p_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret float [[TMP2]]
//
float32_t test_vmaxnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxnmvq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxnmvq_p_f32(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmavq_p_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
// CHECK-NEXT: ret float [[TMP7]]
//
float16_t test_vmaxnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxnmavq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxnmavq_p_f16(a, b, p);
#endif /* POLYMORPHIC */
}
// CHECK-LABEL: @test_vmaxnmavq_p_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret float [[TMP2]]
//
float32_t test_vmaxnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vmaxnmavq_p(a, b, p);
#else /* POLYMORPHIC */
return vmaxnmavq_p_f32(a, b, p);
#endif /* POLYMORPHIC */
}


@@ -798,14 +798,6 @@ def int_arm_mve_pred_v2i : Intrinsic<
 def int_arm_mve_vreinterpretq : Intrinsic<
   [llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 
-multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
-                               list<IntrinsicProperty> props = [],
-                               string name = "",
-                               list<SDNodeProperty> sdprops = []> {
-  def _s: Intrinsic<rets, params, props, name, sdprops>;
-  def _u: Intrinsic<rets, params, props, name, sdprops>;
-}
-
 def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty],
   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
    llvm_anyvector_ty, LLVMMatchType<0>],

@@ -891,11 +883,6 @@ def int_arm_mve_vmaxnma_predicated: Intrinsic<[llvm_anyvector_ty],
   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
   [IntrNoMem]>;
 
-defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
-  [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
-defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
-  [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
-
 multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,
                          LLVMType pred = llvm_anyvector_ty,
                          list<IntrinsicProperty> props = [IntrNoMem]> {

@@ -911,6 +898,19 @@ multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
                        LLVMMatchType<0>, rets[0])], props>;
 }
 
+multiclass MVE_minmaxv {
+  defm v: MVEPredicated<[llvm_i32_ty],
+     [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>;
+  defm av: MVEPredicated<[llvm_i32_ty],
+     [llvm_i32_ty, llvm_anyvector_ty]>;
+  defm nmv: MVEPredicated<[llvm_anyfloat_ty],
+     [LLVMMatchType<0>, llvm_anyvector_ty]>;
+  defm nmav: MVEPredicated<[llvm_anyfloat_ty],
+     [LLVMMatchType<0>, llvm_anyvector_ty]>;
+}
+defm int_arm_mve_min: MVE_minmaxv;
+defm int_arm_mve_max: MVE_minmaxv;
+
 // Intrinsic with a predicated and a non-predicated case. The predicated case
 // has two additional parameters: inactive (the value for inactive lanes, can
 // be undef) and predicate.


@@ -14356,6 +14356,23 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
       return SDValue();
     break;
   }
+
+  case Intrinsic::arm_mve_minv:
+  case Intrinsic::arm_mve_maxv:
+  case Intrinsic::arm_mve_minav:
+  case Intrinsic::arm_mve_maxav:
+  case Intrinsic::arm_mve_minv_predicated:
+  case Intrinsic::arm_mve_maxv_predicated:
+  case Intrinsic::arm_mve_minav_predicated:
+  case Intrinsic::arm_mve_maxav_predicated: {
+    // These intrinsics all take an i32 scalar operand which is narrowed to the
+    // size of a single lane of the vector type they take as the other input.
+    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
   }
 
   return SDValue();


@@ -742,21 +742,42 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
   let Predicates = [HasMVEFloat];
 }
 
-multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
-  def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
+multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin,
+                            MVEVectorVTInfo VTI, string intrBaseName,
+                            ValueType Scalar, RegisterClass ScalarReg> {
+  def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>;
+  defvar Inst = !cast<Instruction>(NAME);
+  defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+  defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
+
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev),
+                                   (VTI.Vec MQPR:$vec))),
+              (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+                                      (VTI.Vec MQPR:$vec)),
+                                ScalarReg)>;
+    def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev),
+                                 (VTI.Vec MQPR:$vec),
+                                 (VTI.Pred VCCR:$pred))),
+              (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+                                      (VTI.Vec MQPR:$vec),
+                                      ARMVCCThen, (VTI.Pred VCCR:$pred)),
+                                ScalarReg)>;
+  }
 }
 
-defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
-defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
-
-multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
-  def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
+multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin,
+                              string intrBase> {
+  defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase,
+                              f32, SPR>;
+  defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase,
+                              f16, HPR>;
 }
 
-defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
-defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
+defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">;
+defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">;
+defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">;
+defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">;
 
 class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
                    bit bit_17, bit bit_7, list<dag> pattern=[]>

@@ -778,31 +799,37 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
   let Inst{0} = 0b0;
 }
 
-multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
-                          MVEVectorVTInfo VTI, Intrinsic intr> {
+multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin,
+                          MVEVectorVTInfo VTI, string intrBaseName> {
   def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
-                       bit_17, bit_7>;
-  defvar Inst = !cast<Instruction>(NAME);
+                       notAbs, isMin>;
+  defvar Inst = !cast<Instruction>(NAME);
+  defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+  defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
+  defvar base_args = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec));
+  defvar args = !if(notAbs, !con(base_args, (? (i32 VTI.Unsigned))),
+                    base_args);
 
-  let Predicates = [HasMVEInt] in
-  def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
-                 (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(i32 !con(args, (unpred_intr))),
+              (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+    def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))),
+              (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec),
+                         ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+  }
 }
 
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
-                           Intrinsic intr_s, Intrinsic intr_u> {
-  defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
-  defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
-  defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
-  defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
-  defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
-  defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
+multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> {
+  defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>;
+  defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>;
+  defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>;
+  defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>;
+  defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>;
+  defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>;
 }
 
-defm MVE_VMINV : MVE_VMINMAXV_ty<
-  "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<
-  "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
+defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">;
 
 let Predicates = [HasMVEInt] in {
   def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),

@@ -833,14 +860,14 @@ let Predicates = [HasMVEInt] in {
 }
 
-multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def s8  : MVE_VMINMAXV<iname, "s8",  0b0, 0b00, 0b0, bit_7>;
-  def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
-  def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
+multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> {
+  defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>;
+  defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>;
+  defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>;
 }
 
-defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
-defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
+defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">;
+defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">;
 
 class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
                      bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>


@@ -1,36 +1,865 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
define arm_aapcs_vfpcc signext i8 @test_vminvq_s8(i8 signext %a, <16 x i8> %b) {
; CHECK-LABEL: test_vminvq_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminv.s8 r0, q0
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 0)
%2 = trunc i32 %1 to i8
ret i8 %2
}
define arm_aapcs_vfpcc signext i16 @test_vminvq_s16(i16 signext %a, <8 x i16> %b) {
; CHECK-LABEL: test_vminvq_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminv.s16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 0)
%2 = trunc i32 %1 to i16
ret i16 %2
}
define arm_aapcs_vfpcc i32 @test_vminvq_s32(i32 %a, <4 x i32> %b) {
; CHECK-LABEL: test_vminvq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminv.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 0)
ret i32 %0
}
define arm_aapcs_vfpcc zeroext i8 @test_vminvq_u8(i8 zeroext %a, <16 x i8> %b) {
; CHECK-LABEL: test_vminvq_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminv.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 1)
%2 = trunc i32 %1 to i8
ret i8 %2
}
define arm_aapcs_vfpcc zeroext i16 @test_vminvq_u16(i16 zeroext %a, <8 x i16> %b) {
; CHECK-LABEL: test_vminvq_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 1)
%2 = trunc i32 %1 to i16
ret i16 %2
}
 define arm_aapcs_vfpcc i32 @test_vminvq_u32(i32 %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_vminvq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vminv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 %a, <4 x i32> %b)
+  %0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 1)
   ret i32 %0
 }
 
-define arm_aapcs_vfpcc i32 @test_vmaxvq_u8(i32 %a, <16 x i8> %b) {
+define arm_aapcs_vfpcc signext i8 @test_vmaxvq_s8(i8 signext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmaxvq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s8 r0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 0)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc signext i16 @test_vmaxvq_s16(i16 signext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmaxvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s16 r0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmaxvq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 0)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_u8(i8 zeroext %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_vmaxvq_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmaxv.u8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.maxv.u.v16i8(i32 %a, <16 x i8> %b)
-  ret i32 %0
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 1)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
 }
 
-define arm_aapcs_vfpcc i32 @test_vminvq_s16(i32 %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vminvq_s16:
+define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_u16(i16 zeroext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmaxvq_u16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminv.s16 r0, q0
+; CHECK-NEXT:    vmaxv.u16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.minv.s.v8i16(i32 %a, <8 x i16> %b)
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 1)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_u32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmaxvq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 1)
   ret i32 %0
 }
 
-declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>)
-declare i32 @llvm.arm.mve.maxv.u.v16i8(i32, <16 x i8>)
-declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>)
define arm_aapcs_vfpcc zeroext i8 @test_vminavq_s8(i8 zeroext %a, <16 x i8> %b) {
; CHECK-LABEL: test_vminavq_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminav.s8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = tail call i32 @llvm.arm.mve.minav.v16i8(i32 %0, <16 x i8> %b)
%2 = trunc i32 %1 to i8
ret i8 %2
}
define arm_aapcs_vfpcc zeroext i16 @test_vminavq_s16(i16 zeroext %a, <8 x i16> %b) {
; CHECK-LABEL: test_vminavq_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminav.s16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = tail call i32 @llvm.arm.mve.minav.v8i16(i32 %0, <8 x i16> %b)
%2 = trunc i32 %1 to i16
ret i16 %2
}
define arm_aapcs_vfpcc i32 @test_vminavq_s32(i32 %a, <4 x i32> %b) {
; CHECK-LABEL: test_vminavq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminav.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = tail call i32 @llvm.arm.mve.minav.v4i32(i32 %a, <4 x i32> %b)
ret i32 %0
}
define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_s8(i8 zeroext %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmaxavq_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxav.s8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = tail call i32 @llvm.arm.mve.maxav.v16i8(i32 %0, <16 x i8> %b)
%2 = trunc i32 %1 to i8
ret i8 %2
}
define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_s16(i16 zeroext %a, <8 x i16> %b) {
; CHECK-LABEL: test_vmaxavq_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxav.s16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = tail call i32 @llvm.arm.mve.maxav.v8i16(i32 %0, <8 x i16> %b)
%2 = trunc i32 %1 to i16
ret i16 %2
}
define arm_aapcs_vfpcc i32 @test_vmaxavq_s32(i32 %a, <4 x i32> %b) {
; CHECK-LABEL: test_vmaxavq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxav.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = tail call i32 @llvm.arm.mve.maxav.v4i32(i32 %a, <4 x i32> %b)
ret i32 %0
}
define arm_aapcs_vfpcc float @test_vminnmvq_f16(float %a.coerce, <8 x half> %b) {
; CHECK-LABEL: test_vminnmvq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vminnmv.f16 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = tail call half @llvm.arm.mve.minnmv.f16.v8f16(half %1, <8 x half> %b)
%3 = bitcast half %2 to i16
%tmp2.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp2.0.insert.ext to float
ret float %4
}
define arm_aapcs_vfpcc float @test_vminnmvq_f32(float %a, <4 x float> %b) {
; CHECK-LABEL: test_vminnmvq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vminnmv.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = tail call float @llvm.arm.mve.minnmv.f32.v4f32(float %a, <4 x float> %b)
ret float %0
}
define arm_aapcs_vfpcc float @test_vminnmavq_f16(float %a.coerce, <8 x half> %b) {
; CHECK-LABEL: test_vminnmavq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vminnmav.f16 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = tail call half @llvm.arm.mve.minnmav.f16.v8f16(half %1, <8 x half> %b)
%3 = bitcast half %2 to i16
%tmp2.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp2.0.insert.ext to float
ret float %4
}
define arm_aapcs_vfpcc float @test_vminnmavq_f32(float %a, <4 x float> %b) {
; CHECK-LABEL: test_vminnmavq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vminnmav.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = tail call float @llvm.arm.mve.minnmav.f32.v4f32(float %a, <4 x float> %b)
ret float %0
}
define arm_aapcs_vfpcc float @test_vmaxnmvq_f16(float %a.coerce, <8 x half> %b) {
; CHECK-LABEL: test_vmaxnmvq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmv.f16 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = tail call half @llvm.arm.mve.maxnmv.f16.v8f16(half %1, <8 x half> %b)
%3 = bitcast half %2 to i16
%tmp2.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp2.0.insert.ext to float
ret float %4
}
define arm_aapcs_vfpcc float @test_vmaxnmvq_f32(float %a, <4 x float> %b) {
; CHECK-LABEL: test_vmaxnmvq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmv.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = tail call float @llvm.arm.mve.maxnmv.f32.v4f32(float %a, <4 x float> %b)
ret float %0
}
define arm_aapcs_vfpcc float @test_vmaxnmavq_f16(float %a.coerce, <8 x half> %b) {
; CHECK-LABEL: test_vmaxnmavq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmav.f16 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = tail call half @llvm.arm.mve.maxnmav.f16.v8f16(half %1, <8 x half> %b)
%3 = bitcast half %2 to i16
%tmp2.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp2.0.insert.ext to float
ret float %4
}

define arm_aapcs_vfpcc float @test_vmaxnmavq_f32(float %a, <4 x float> %b) {
; CHECK-LABEL: test_vmaxnmavq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmav.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = tail call float @llvm.arm.mve.maxnmav.f32.v4f32(float %a, <4 x float> %b)
ret float %0
}
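
; Predicated integer reductions: the i16 predicate %p is zero-extended to
; i32, converted to a <N x i1> mask by @llvm.arm.mve.pred.i2v, and passed
; to the ".predicated" intrinsic, which also carries an i32 flag (0 for
; signed, 1 for unsigned) selecting the .sN or .uN form of the instruction.
; Presumed ACLE-level equivalent (assuming <arm_mve.h> naming):
;   int8_t r = vminvq_p_s8(a, b, p);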
define arm_aapcs_vfpcc signext i8 @test_vminvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.s8 r0, q0
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc signext i16 @test_vminvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.s16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vminvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1)
ret i32 %2
}

define arm_aapcs_vfpcc zeroext i8 @test_vminvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc zeroext i16 @test_vminvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 1, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vminvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminvq_p_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminvt.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1)
ret i32 %2
}
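
; vmaxvq_p_*: same shape as the vminvq_p_* tests above, emitting vmaxvt
; inside a VPST block. Presumed ACLE-level call (assuming <arm_mve.h>
; naming):
;   uint16_t r = vmaxvq_p_u16(a, b, p);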
define arm_aapcs_vfpcc signext i8 @test_vmaxvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.s8 r0, q0
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc signext i16 @test_vmaxvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.s16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vmaxvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1)
ret i32 %2
}

define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 1, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vmaxvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxvq_p_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxvt.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1)
ret i32 %2
}
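
; vminavq_p_*: the absolute-value minimum exists only for signed element
; types, takes an unsigned accumulator, and its IR intrinsic carries no
; signedness flag. Presumed ACLE-level call (assuming <arm_mve.h> naming):
;   uint8_t r = vminavq_p_s8(a, b, p);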
define arm_aapcs_vfpcc zeroext i8 @test_vminavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminavq_p_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminavt.s8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc zeroext i16 @test_vminavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminavq_p_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminavt.s16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vminavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminavq_p_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vminavt.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1)
ret i32 %2
}
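
; vmaxavq_p_*: as for vminavq_p_* above, but selecting vmaxavt. Presumed
; ACLE-level call (assuming <arm_mve.h> naming):
;   uint32_t r = vmaxavq_p_s32(a, b, p);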
define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxavq_p_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxavt.s8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i8 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2)
%4 = trunc i32 %3 to i8
ret i8 %4
}

define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxavq_p_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxavt.s16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %a to i32
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2)
%4 = trunc i32 %3 to i16
ret i16 %4
}

define arm_aapcs_vfpcc i32 @test_vmaxavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxavq_p_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxavt.s32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1)
ret i32 %2
}
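
; Predicated floating-point reductions: the predicate is converted exactly
; as in the integer tests, then passed to the float ".predicated"
; intrinsics, which need no signedness flag. Presumed ACLE-level call
; (assuming <arm_mve.h> naming):
;   float16_t r = vminnmvq_p_f16(a, b, p);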
define arm_aapcs_vfpcc float @test_vminnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminnmvq_p_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vminnmvt.f16 r1, q1
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = zext i16 %p to i32
%3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
%4 = tail call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
%5 = bitcast half %4 to i16
%tmp2.0.insert.ext = zext i16 %5 to i32
%6 = bitcast i32 %tmp2.0.insert.ext to float
ret float %6
}

define arm_aapcs_vfpcc float @test_vminnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminnmvq_p_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vpst
; CHECK-NEXT: vminnmvt.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
ret float %2
}

define arm_aapcs_vfpcc float @test_vminnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminnmavq_p_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vminnmavt.f16 r1, q1
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = zext i16 %p to i32
%3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
%4 = tail call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
%5 = bitcast half %4 to i16
%tmp2.0.insert.ext = zext i16 %5 to i32
%6 = bitcast i32 %tmp2.0.insert.ext to float
ret float %6
}

define arm_aapcs_vfpcc float @test_vminnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vminnmavq_p_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vpst
; CHECK-NEXT: vminnmavt.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
ret float %2
}

define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxnmvq_p_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxnmvt.f16 r1, q1
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = zext i16 %p to i32
%3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
%4 = tail call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
%5 = bitcast half %4 to i16
%tmp2.0.insert.ext = zext i16 %5 to i32
%6 = bitcast i32 %tmp2.0.insert.ext to float
ret float %6
}

define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxnmvq_p_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxnmvt.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
ret float %2
}

define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxnmavq_p_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxnmavt.f16 r1, q1
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vstr.16 s0, [sp, #2]
; CHECK-NEXT: ldrh.w r0, [sp, #2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = zext i16 %p to i32
%3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
%4 = tail call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
%5 = bitcast half %4 to i16
%tmp2.0.insert.ext = zext i16 %5 to i32
%6 = bitcast i32 %tmp2.0.insert.ext to float
ret float %6
}

define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmaxnmavq_p_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmaxnmavt.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
ret float %2
}
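
; Declarations for the MVE intrinsics exercised above.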
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare i32 @llvm.arm.mve.minv.v16i8(i32, <16 x i8>, i32)
declare i32 @llvm.arm.mve.minv.v8i16(i32, <8 x i16>, i32)
declare i32 @llvm.arm.mve.minv.v4i32(i32, <4 x i32>, i32)
declare i32 @llvm.arm.mve.maxv.v16i8(i32, <16 x i8>, i32)
declare i32 @llvm.arm.mve.maxv.v8i16(i32, <8 x i16>, i32)
declare i32 @llvm.arm.mve.maxv.v4i32(i32, <4 x i32>, i32)
declare i32 @llvm.arm.mve.minav.v16i8(i32, <16 x i8>)
declare i32 @llvm.arm.mve.minav.v8i16(i32, <8 x i16>)
declare i32 @llvm.arm.mve.minav.v4i32(i32, <4 x i32>)
declare i32 @llvm.arm.mve.maxav.v16i8(i32, <16 x i8>)
declare i32 @llvm.arm.mve.maxav.v8i16(i32, <8 x i16>)
declare i32 @llvm.arm.mve.maxav.v4i32(i32, <4 x i32>)
declare i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>)
declare i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>)
declare i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>)
declare i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>)
declare i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>)
declare i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>)
declare i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>)
declare i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>)
declare i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>)
declare i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>)
declare i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>)
declare i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>)
declare half @llvm.arm.mve.minnmv.f16.v8f16(half, <8 x half>)
declare half @llvm.arm.mve.minnmav.f16.v8f16(half, <8 x half>)
declare half @llvm.arm.mve.maxnmv.f16.v8f16(half, <8 x half>)
declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
declare half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
declare float @llvm.arm.mve.minnmv.f32.v4f32(float, <4 x float>)
declare float @llvm.arm.mve.minnmav.f32.v4f32(float, <4 x float>)
declare float @llvm.arm.mve.maxnmv.f32.v4f32(float, <4 x float>)
declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
declare float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)