[X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min) intrinsics to Clang .

After LGTM and Check-all Vector-reduction arithmetic accepts vectors as inputs and produces scalars as outputs.This class of vector operation forms the basis of many scientific computations. In vector-reduction arithmetic, the evaluation off is independent of the order of the input elements of V. Reviewer: 1. craig.topper 2. igorb Differential Revision: https://reviews.llvm.org/D25988 llvm-svn: 285493
2016-10-29 10:29:20 +00:00 · 2016-10-29 10:29:20 +00:00 · 25eb420233
parent 519b4ccd70
commit 25eb420233
2 changed files with 717 additions and 0 deletions
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@ -9904,6 +9904,286 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
 }

+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector with size of 512.
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, 2, 3, -1, -1, -1, -1),  \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 4, 5, 6, 7, -1, -1, -1, -1)); \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 2, 3, -1, -1, -1, -1, -1,     \
+                                                 -1));                         \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                0, -1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                1, -1, -1, -1, -1, -1, -1, -1))\
+                                                ;                              \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_epi64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
+(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_min_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x8000000000000000}
+//                   {max_epu,0x0000000000000000}
+//                   {max_pd, 0xFFF0000000000000}
+//                   {min_epi,0x7FFFFFFFFFFFFFFF}
+//                   {min_epu,0xFFFFFFFFFFFFFFFF}
+//                   {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                             (__mmask8)Mask,                                   \
+                             (__v8d##T2)Vec512,                                \
+                             (__v8d##T2)Vec512Neutral);                        \
+    _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2);                    \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+                                  max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+                                  max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(0xFFF0000000000000),
+                                  max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                                  min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+                                  min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(0x7FF0000000000000),
+                                  min_pd, d, f, pd, __M);
+}
+
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, 4, 5, 6, 7,                      \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  8, 9, 10, 11, 12, 13, 14, 15,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  4, 5, 6, 7, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  2, 3, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0,  -1, -1, -1, -1, -1, -1, -1,              \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  1, -1, -1, -1, -1, -1, -1, -1,               \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, min_ps, , f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x80000000}
+//                   {max_epu,0x00000000}
+//                   {max_ps, 0xFF800000}
+//                   {min_epi,0x7FFFFFFF}
+//                   {min_epu,0xFFFFFFFF}
+//                   {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                                        (__mmask16)Mask,                       \
+                                        (__v16s##T2)Vec512,                    \
+                                        (__v16s##T2)Vec512Neutral);            \
+   _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2);                     \
+   })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(0xFF800000), max_ps, , f,
+                                  ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(0x7F800000), min_ps, , f,
+                                  ps, __M);
+}
+
 #undef __DEFAULT_FN_ATTRS

 #endif // __AVX512FINTRIN_H
--- a/clang/test/CodeGen/avx512-reduceMinMaxIntrin.c
+++ b/clang/test/CodeGen/avx512-reduceMinMaxIntrin.c
@ -0,0 +1,437 @@
+// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror |opt -instnamer -S |FileCheck %s
+
+#include <immintrin.h>
+
+long long test_mm512_reduce_max_epi64(__m512i __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
+  // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
+  // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_reduce_max_epi64(__W);
+}
+
+unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
+  // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
+  // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_reduce_max_epu64(__W); 
+}
+
+double test_mm512_reduce_max_pd(__m512d __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
+  // CHECK: ret double %vecext.i
+  return _mm512_reduce_max_pd(__W); 
+}
+
+long long test_mm512_reduce_min_epi64(__m512i __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
+  // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
+  // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_reduce_max_epi64(__W);
+}
+
+unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
+  // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
+  // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_reduce_max_epu64(__W); 
+}
+
+double test_mm512_reduce_min_pd(__m512d __W){
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
+  // CHECK: ret double %vecext.i
+  return _mm512_reduce_min_pd(__W); 
+}
+
+long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle1.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle4.i
+  // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp6 = icmp sgt <8 x i64> %tmp5, %shuffle7.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
+  // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
+  // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_mask_reduce_max_epi64(__M, __W); 
+}
+
+unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i
+  // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
+  // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
+  // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_mask_reduce_max_epu64(__M, __W); 
+}
+
+long long test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000>
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
+  // CHECK: %conv = fptosi double %vecext.i to i64
+  // CHECK: ret i64 %conv
+  return _mm512_mask_reduce_max_pd(__M, __W); 
+}
+
+long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle1.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle4.i
+  // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp6 = icmp slt <8 x i64> %tmp5, %shuffle7.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
+  // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
+  // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_mask_reduce_min_epi64(__M, __W); 
+}
+
+long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer
+  // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i
+  // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i
+  // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i
+  // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
+  // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
+  // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
+  // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
+  // CHECK: ret i64 %vecext.i
+  return _mm512_mask_reduce_max_epu64(__M, __W); 
+}
+
+double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
+  // CHECK: %tmp = bitcast i8 %__M to <8 x i1>
+  // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000>
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
+  // CHECK: ret double %vecext.i
+  return _mm512_mask_reduce_min_pd(__M, __W); 
+}
+
+int test_mm512_reduce_max_epi32(__m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = icmp sgt <16 x i32> %tmp, %shuffle1.i
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle3.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle6.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle9.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
+  // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_max_epi32(__W);
+}
+
+unsigned int test_mm512_reduce_max_epu32(__m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = icmp ugt <16 x i32> %tmp, %shuffle1.i
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle3.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle6.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle9.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
+  // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_max_epu32(__W); 
+}
+
+float test_mm512_reduce_max_ps(__m512 __W){
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
+  // CHECK: ret float %vecext.i
+  return _mm512_reduce_max_ps(__W); 
+}
+
+int test_mm512_reduce_min_epi32(__m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = icmp slt <16 x i32> %tmp, %shuffle1.i
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle3.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle6.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle9.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
+  // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_min_epi32(__W);
+}
+
+unsigned int test_mm512_reduce_min_epu32(__m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = icmp ult <16 x i32> %tmp, %shuffle1.i
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle3.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle6.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle9.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
+  // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_min_epu32(__W); 
+}
+
+float test_mm512_reduce_min_ps(__m512 __W){
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
+  // CHECK: ret float %vecext.i
+  return _mm512_reduce_min_ps(__W); 
+}
+
+int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle1.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle4.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle7.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
+  // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp9 = icmp sgt <16 x i32> %tmp8, %shuffle10.i
+  // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
+  // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_max_epi32(__M, __W); 
+}
+
+unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> zeroinitializer
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle1.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle4.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle7.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
+  // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp9 = icmp ugt <16 x i32> %tmp8, %shuffle10.i
+  // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
+  // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_max_epu32(__M, __W); 
+}
+
+float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
+  // CHECK: %tmp = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000>
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
+  // CHECK: ret float %vecext.i
+  return _mm512_mask_reduce_max_ps(__M, __W); 
+}
+
+int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle1.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle4.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle7.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
+  // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp9 = icmp slt <16 x i32> %tmp8, %shuffle10.i
+  // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
+  // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_min_epi32(__M, __W); 
+}
+
+unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
+  // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
+  // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle1.i
+  // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
+  // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle4.i
+  // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
+  // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle7.i
+  // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
+  // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp9 = icmp ult <16 x i32> %tmp8, %shuffle10.i
+  // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
+  // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
+  // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
+  // CHECK: %conv.i = trunc i64 %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_min_epu32(__M, __W); 
+}
+
+float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
+  // CHECK: %tmp = bitcast i16 %__M to <16 x i1>
+  // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000>
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
+  // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
+  // CHECK: ret float %vecext.i
+  return _mm512_mask_reduce_min_ps(__M, __W); 
+}
+