[X86] Use undefined instead of setzero in shufflevector based intrinsics when the second source is unused. Rewrite immediate extractions in shuffle intrinsics to be in ((c >> x) & y) form instead of ((c & z) >> x). This way only x varies between each use instead of having to vary x and z.

llvm-svn: 274525
2016-07-04 22:18:01 +00:00 · 2016-07-04 22:18:01 +00:00 · 2a383c9273
parent c03d610cff
commit 2a383c9273
11 changed files with 228 additions and 217 deletions
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@ -497,40 +497,42 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)

 #define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
-                                   (__v8si)_mm256_setzero_si256(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6)); })
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })

 #define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_setzero_si256(), \
+                                   (__v16hi)_mm256_undefined_si256(), \
                                   0, 1, 2, 3, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6), \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
                                   8, 9, 10, 11, \
-                                   12 + (((imm) & 0x03) >> 0), \
-                                   12 + (((imm) & 0x0c) >> 2), \
-                                   12 + (((imm) & 0x30) >> 4), \
-                                   12 + (((imm) & 0xc0) >> 6)); })
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3)); })

 #define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_setzero_si256(), \
-                                   0 + (((imm) & 0x03) >> 0), \
-                                   0 + (((imm) & 0x0c) >> 2), \
-                                   0 + (((imm) & 0x30) >> 4), \
-                                   0 + (((imm) & 0xc0) >> 6), \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
                                   4, 5, 6, 7, \
-                                   8 + (((imm) & 0x03) >> 0), \
-                                   8 + (((imm) & 0x0c) >> 2), \
-                                   8 + (((imm) & 0x30) >> 4), \
-                                   8 + (((imm) & 0xc0) >> 6), \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
                                   12, 13, 14, 15); })

 static __inline__ __m256i __DEFAULT_FN_ATTRS
@ -940,9 +942,11 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)

 #define _mm256_permute4x64_pd(V, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
-                                   (__v4df)_mm256_setzero_pd(), \
-                                   (M) & 0x3, ((M) & 0xc) >> 2, \
-                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })

 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@ -952,16 +956,18 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)

 #define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_setzero_si256(), \
-                                   (M) & 0x3, ((M) & 0xc) >> 2, \
-                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })

 #define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })

 #define _mm256_extracti128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_setzero_si256(), \
+                                   (__v4di)_mm256_undefined_si256(), \
                                   (((M) & 1) ? 2 : 0), \
                                   (((M) & 1) ? 3 : 1) ); })

--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@ -1613,27 +1613,27 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)

 #define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
-                                   (__v32hi)_mm512_setzero_hi(), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
                                   0, 1, 2, 3, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6), \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
                                   8, 9, 10, 11, \
-                                   12 + (((imm) & 0x03) >> 0), \
-                                   12 + (((imm) & 0x0c) >> 2), \
-                                   12 + (((imm) & 0x30) >> 4), \
-                                   12 + (((imm) & 0xc0) >> 6), \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3), \
                                   16, 17, 18, 19, \
-                                   20 + (((imm) & 0x03) >> 0), \
-                                   20 + (((imm) & 0x0c) >> 2), \
-                                   20 + (((imm) & 0x30) >> 4), \
-                                   20 + (((imm) & 0xc0) >> 6), \
+                                   20 + (((imm) >> 0) & 0x3), \
+                                   20 + (((imm) >> 2) & 0x3), \
+                                   20 + (((imm) >> 4) & 0x3), \
+                                   20 + (((imm) >> 6) & 0x3), \
                                   24, 25, 26, 27, \
-                                   28 + (((imm) & 0x03) >> 0), \
-                                   28 + (((imm) & 0x0c) >> 2), \
-                                   28 + (((imm) & 0x30) >> 4), \
-                                   28 + (((imm) & 0xc0) >> 6)); })
+                                   28 + (((imm) >> 0) & 0x3), \
+                                   28 + (((imm) >> 2) & 0x3), \
+                                   28 + (((imm) >> 4) & 0x3), \
+                                   28 + (((imm) >> 6) & 0x3)); })

 #define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
@ -1649,26 +1649,26 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)

 #define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
-                                   (__v32hi)_mm512_setzero_hi(), \
-                                   0 + (((imm) & 0x03) >> 0), \
-                                   0 + (((imm) & 0x0c) >> 2), \
-                                   0 + (((imm) & 0x30) >> 4), \
-                                   0 + (((imm) & 0xc0) >> 6), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
                                   4, 5, 6, 7, \
-                                   8 + (((imm) & 0x03) >> 0), \
-                                   8 + (((imm) & 0x0c) >> 2), \
-                                   8 + (((imm) & 0x30) >> 4), \
-                                   8 + (((imm) & 0xc0) >> 6), \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
                                   12, 13, 14, 15, \
-                                   16 + (((imm) & 0x03) >> 0), \
-                                   16 + (((imm) & 0x0c) >> 2), \
-                                   16 + (((imm) & 0x30) >> 4), \
-                                   16 + (((imm) & 0xc0) >> 6), \
+                                   16 + (((imm) >> 0) & 0x3), \
+                                   16 + (((imm) >> 2) & 0x3), \
+                                   16 + (((imm) >> 4) & 0x3), \
+                                   16 + (((imm) >> 6) & 0x3), \
                                   20, 21, 22, 23, \
-                                   24 + (((imm) & 0x03) >> 0), \
-                                   24 + (((imm) & 0x0c) >> 2), \
-                                   24 + (((imm) & 0x30) >> 4), \
-                                   24 + (((imm) & 0xc0) >> 6), \
+                                   24 + (((imm) >> 0) & 0x3), \
+                                   24 + (((imm) >> 2) & 0x3), \
+                                   24 + (((imm) >> 4) & 0x3), \
+                                   24 + (((imm) >> 6) & 0x3), \
                                   28, 29, 30, 31); })


--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@ -6542,15 +6542,15 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,

 #define _mm512_permute_pd(X, C) __extension__ ({ \
  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
-                                   (__v8df)_mm512_setzero_pd(), \
-                                   0 + (((C) & 0x01) >> 0), \
-                                   0 + (((C) & 0x02) >> 1), \
-                                   2 + (((C) & 0x04) >> 2), \
-                                   2 + (((C) & 0x08) >> 3), \
-                                   4 + (((C) & 0x10) >> 4), \
-                                   4 + (((C) & 0x20) >> 5), \
-                                   6 + (((C) & 0x40) >> 6), \
-                                   6 + (((C) & 0x80) >> 7)); })
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1), \
+                                   4 + (((C) >> 4) & 0x1), \
+                                   4 + (((C) >> 5) & 0x1), \
+                                   6 + (((C) >> 6) & 0x1), \
+                                   6 + (((C) >> 7) & 0x1)); })

 #define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@ -6564,23 +6564,23 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,

 #define _mm512_permute_ps(X, C) __extension__ ({ \
  (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
-                                  (__v16sf)_mm512_setzero_ps(), \
-                                   0  + (((C) & 0x03) >> 0), \
-                                   0  + (((C) & 0x0c) >> 2), \
-                                   0  + (((C) & 0x30) >> 4), \
-                                   0  + (((C) & 0xc0) >> 6), \
-                                   4  + (((C) & 0x03) >> 0), \
-                                   4  + (((C) & 0x0c) >> 2), \
-                                   4  + (((C) & 0x30) >> 4), \
-                                   4  + (((C) & 0xc0) >> 6), \
-                                   8  + (((C) & 0x03) >> 0), \
-                                   8  + (((C) & 0x0c) >> 2), \
-                                   8  + (((C) & 0x30) >> 4), \
-                                   8  + (((C) & 0xc0) >> 6), \
-                                   12 + (((C) & 0x03) >> 0), \
-                                   12 + (((C) & 0x0c) >> 2), \
-                                   12 + (((C) & 0x30) >> 4), \
-                                   12 + (((C) & 0xc0) >> 6)); })
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                   0  + (((C) >> 0) & 0x3), \
+                                   0  + (((C) >> 2) & 0x3), \
+                                   0  + (((C) >> 4) & 0x3), \
+                                   0  + (((C) >> 6) & 0x3), \
+                                   4  + (((C) >> 0) & 0x3), \
+                                   4  + (((C) >> 2) & 0x3), \
+                                   4  + (((C) >> 4) & 0x3), \
+                                   4  + (((C) >> 6) & 0x3), \
+                                   8  + (((C) >> 0) & 0x3), \
+                                   8  + (((C) >> 2) & 0x3), \
+                                   8  + (((C) >> 4) & 0x3), \
+                                   8  + (((C) >> 6) & 0x3), \
+                                   12 + (((C) >> 0) & 0x3), \
+                                   12 + (((C) >> 2) & 0x3), \
+                                   12 + (((C) >> 4) & 0x3), \
+                                   12 + (((C) >> 6) & 0x3)); })

 #define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@ -7170,14 +7170,14 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
 #define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
                                   (__v8df)(__m512d)(B), \
-                                   (((M) & 0x01) >> 0) +  0, \
-                                   (((M) & 0x02) >> 1) +  8, \
-                                   (((M) & 0x04) >> 2) +  2, \
-                                   (((M) & 0x08) >> 3) + 10, \
-                                   (((M) & 0x10) >> 4) +  4, \
-                                   (((M) & 0x20) >> 5) + 12, \
-                                   (((M) & 0x40) >> 6) +  6, \
-                                   (((M) & 0x80) >> 7) + 14); })
+                                   0  + (((M) >> 0) & 0x1), \
+                                   8  + (((M) >> 1) & 0x1), \
+                                   2  + (((M) >> 2) & 0x1), \
+                                   10 + (((M) >> 3) & 0x1), \
+                                   4  + (((M) >> 4) & 0x1), \
+                                   12 + (((M) >> 5) & 0x1), \
+                                   6  + (((M) >> 6) & 0x1), \
+                                   14 + (((M) >> 7) & 0x1)); })

 #define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@ -8686,14 +8686,14 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 #define _mm512_permutex_pd(X, C) __extension__ ({ \
  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
                                   (__v8df)_mm512_undefined_pd(), \
-                                   0 + (((C) & 0x03) >> 0), \
-                                   0 + (((C) & 0x0c) >> 2), \
-                                   0 + (((C) & 0x30) >> 4), \
-                                   0 + (((C) & 0xc0) >> 6), \
-                                   4 + (((C) & 0x03) >> 0), \
-                                   4 + (((C) & 0x0c) >> 2), \
-                                   4 + (((C) & 0x30) >> 4), \
-                                   4 + (((C) & 0xc0) >> 6)); })
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })

 #define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@ -8708,14 +8708,14 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 #define _mm512_permutex_epi64(X, C) __extension__ ({ \
  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
                                   (__v8di)_mm512_undefined_epi32(), \
-                                   0 + (((C) & 0x03) >> 0), \
-                                   0 + (((C) & 0x0c) >> 2), \
-                                   0 + (((C) & 0x30) >> 4), \
-                                   0 + (((C) & 0xc0) >> 6), \
-                                   4 + (((C) & 0x03) >> 0), \
-                                   4 + (((C) & 0x0c) >> 2), \
-                                   4 + (((C) & 0x30) >> 4), \
-                                   4 + (((C) & 0xc0) >> 6)); })
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })

 #define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
@ -9069,23 +9069,23 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)

 #define _mm512_shuffle_epi32(A, I) __extension__ ({ \
  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
-                                   (__v16si)_mm512_setzero_si512(), \
-                                   0  + (((I) & 0x03) >> 0), \
-                                   0  + (((I) & 0x0c) >> 2), \
-                                   0  + (((I) & 0x30) >> 4), \
-                                   0  + (((I) & 0xc0) >> 6), \
-                                   4  + (((I) & 0x03) >> 0), \
-                                   4  + (((I) & 0x0c) >> 2), \
-                                   4  + (((I) & 0x30) >> 4), \
-                                   4  + (((I) & 0xc0) >> 6), \
-                                   8  + (((I) & 0x03) >> 0), \
-                                   8  + (((I) & 0x0c) >> 2), \
-                                   8  + (((I) & 0x30) >> 4), \
-                                   8  + (((I) & 0xc0) >> 6), \
-                                   12 + (((I) & 0x03) >> 0), \
-                                   12 + (((I) & 0x0c) >> 2), \
-                                   12 + (((I) & 0x30) >> 4), \
-                                   12 + (((I) & 0xc0) >> 6)); })
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0  + (((I) >> 0) & 0x3), \
+                                   0  + (((I) >> 2) & 0x3), \
+                                   0  + (((I) >> 4) & 0x3), \
+                                   0  + (((I) >> 6) & 0x3), \
+                                   4  + (((I) >> 0) & 0x3), \
+                                   4  + (((I) >> 2) & 0x3), \
+                                   4  + (((I) >> 4) & 0x3), \
+                                   4  + (((I) >> 6) & 0x3), \
+                                   8  + (((I) >> 0) & 0x3), \
+                                   8  + (((I) >> 2) & 0x3), \
+                                   8  + (((I) >> 4) & 0x3), \
+                                   8  + (((I) >> 6) & 0x3), \
+                                   12 + (((I) >> 0) & 0x3), \
+                                   12 + (((I) >> 2) & 0x3), \
+                                   12 + (((I) >> 4) & 0x3), \
+                                   12 + (((I) >> 6) & 0x3)); })

 #define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@ -8803,8 +8803,8 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 #define _mm256_permutex_pd(X, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(X), \
                                   (__v4df)_mm256_undefined_pd(), \
-                                   (C) & 0x3, ((C) & 0xc) >> 2, \
-                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })

 #define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \
  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@ -8819,8 +8819,8 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 #define _mm256_permutex_epi64(X, C) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(X), \
                                   (__v4di)_mm256_undefined_si256(), \
-                                   (C) & 0x3, ((C) & 0xc) >> 2, \
-                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })

 #define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@ -999,8 +999,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
-                                   (__v2df)_mm_setzero_pd(), \
-                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+                                   (__v2df)_mm_undefined_pd(), \
+                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })

 /// \brief Copies the values in a 256-bit vector of [4 x double] as
 ///    specified by the immediate integer operand.
@ -1040,10 +1040,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
-                                   (__v4df)_mm256_setzero_pd(), \
-                                   (C) & 0x1, ((C) & 0x2) >> 1, \
-                                   2 + (((C) & 0x4) >> 2), \
-                                   2 + (((C) & 0x8) >> 3)); })
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1)); })

 /// \brief Copies the values in a 128-bit vector of [4 x float] as
 ///    specified by the immediate integer operand.
@ -1099,9 +1100,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
-                                  (__v4sf)_mm_setzero_ps(), \
-                                   (C) & 0x3, ((C) & 0xc) >> 2, \
-                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+                                  (__v4sf)_mm_undefined_ps(), \
+                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })

 /// \brief Copies the values in a 256-bit vector of [8 x float] as
 ///    specified by the immediate integer operand.
@ -1193,13 +1194,15 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
-                                  (__v8sf)_mm256_setzero_ps(), \
-                                  (C) & 0x3, ((C) & 0xc) >> 2, \
-                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
-                                  4 + (((C) & 0x03) >> 0), \
-                                  4 + (((C) & 0x0c) >> 2), \
-                                  4 + (((C) & 0x30) >> 4), \
-                                  4 + (((C) & 0xc0) >> 6)); })
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  0 + (((C) >> 0) & 0x3), \
+                                  0 + (((C) >> 2) & 0x3), \
+                                  0 + (((C) >> 4) & 0x3), \
+                                  0 + (((C) >> 6) & 0x3), \
+                                  4 + (((C) >> 0) & 0x3), \
+                                  4 + (((C) >> 2) & 0x3), \
+                                  4 + (((C) >> 4) & 0x3), \
+                                  4 + (((C) >> 6) & 0x3)); })

 /// \brief Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [4 x double], as specified by the immediate integer operand.
@ -1538,16 +1541,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
-        (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
-                                        (__v8sf)(__m256)(b), \
-                                        (mask) & 0x3, \
-                                        ((mask) & 0xc) >> 2, \
-                                        (((mask) & 0x30) >> 4) + 8, \
-                                        (((mask) & 0xc0) >> 6) + 8, \
-                                        ((mask) & 0x3) + 4, \
-                                        (((mask) & 0xc) >> 2) + 4, \
-                                        (((mask) & 0x30) >> 4) + 12, \
-                                        (((mask) & 0xc0) >> 6) + 12); })
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), \
+                                  0  + (((mask) >> 0) & 0x3), \
+                                  0  + (((mask) >> 2) & 0x3), \
+                                  8  + (((mask) >> 4) & 0x3), \
+                                  8  + (((mask) >> 6) & 0x3), \
+                                  4  + (((mask) >> 0) & 0x3), \
+                                  4  + (((mask) >> 2) & 0x3), \
+                                  12 + (((mask) >> 4) & 0x3), \
+                                  12 + (((mask) >> 6) & 0x3)); })

 /// \brief Selects four double-precision values from the 256-bit operands of
 ///    [4 x double], as specified by the immediate value operand. The selected
@ -1591,12 +1594,12 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    destination.
 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
-        (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
-                                         (__v4df)(__m256d)(b), \
-                                         (mask) & 0x1, \
-                                         (((mask) & 0x2) >> 1) + 4, \
-                                         (((mask) & 0x4) >> 2) + 2, \
-                                         (((mask) & 0x8) >> 3) + 6); })
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), \
+                                   0 + (((mask) >> 0) & 0x1), \
+                                   4 + (((mask) >> 1) & 0x1), \
+                                   2 + (((mask) >> 2) & 0x1), \
+                                   6 + (((mask) >> 3) & 0x1)); })

 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@ -2814,7 +2817,7 @@ _mm256_castsi128_si256(__m128i __a)
 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
-    (__v8sf)(_mm256_setzero_ps()), \
+    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), \
    (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), \
@ -2823,14 +2826,14 @@ _mm256_castsi128_si256(__m128i __a)
 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
-    (__v4df)(_mm256_setzero_pd()), \
+    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
-    (__v4di)(_mm256_setzero_si256()), \
+    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@ -2300,25 +2300,25 @@ _mm_movemask_epi8(__m128i __a)

 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
-                                   (__v4si)_mm_setzero_si128(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+                                   (__v4si)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })

 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_setzero_si128(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })

 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_setzero_si128(), \
+                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6)); })
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
@ -2406,7 +2406,8 @@ _mm_movemask_pd(__m128d __a)

 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
-                                   (i) & 1, (((i) & 2) >> 1) + 2); })
+                                   0 + (((i) >> 0) & 0x1), \
+                                   2 + (((i) >> 1) & 0x1)); })

 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@ -2496,9 +2496,10 @@ _mm_setcsr(unsigned int __i)
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
-                                  (((mask) & 0x30) >> 4) + 4, \
-                                  (((mask) & 0xc0) >> 6) + 4); })
+                                  0 + (((mask) >> 0) & 0x3), \
+                                  0 + (((mask) >> 2) & 0x3), \
+                                  4 + (((mask) >> 4) & 0x3), \
+                                  4 + (((mask) >> 6) & 0x3)); })

 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x
--- a/clang/test/CodeGen/avx-builtins.c
+++ b/clang/test/CodeGen/avx-builtins.c
@ -346,19 +346,19 @@ long long test_mm256_extract_epi64(__m256i A) {

 __m128d test_mm256_extractf128_pd(__m256d A) {
  // CHECK-LABEL: test_mm256_extractf128_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  return _mm256_extractf128_pd(A, 1);
 }

 __m128 test_mm256_extractf128_ps(__m256 A) {
  // CHECK-LABEL: test_mm256_extractf128_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  return _mm256_extractf128_ps(A, 1);
 }

 __m128i test_mm256_extractf128_si256(__m256i A) {
  // CHECK-LABEL: test_mm256_extractf128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  return _mm256_extractf128_si256(A, 1);
 }

@ -647,32 +647,32 @@ __m256 test_mm256_or_ps(__m256 A, __m256 B) {

 __m128d test_mm_permute_pd(__m128d A) {
  // CHECK-LABEL: test_mm_permute_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  return _mm_permute_pd(A, 1);
 }

 __m256d test_mm256_permute_pd(__m256d A) {
  // CHECK-LABEL: test_mm256_permute_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  return _mm256_permute_pd(A, 5);
 }

 __m128 test_mm_permute_ps(__m128 A) {
  // CHECK-LABEL: test_mm_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  return _mm_permute_ps(A, 0x1b);
 }

 // Test case for PR12401
 __m128 test2_mm_permute_ps(__m128 a) {
  // CHECK-LABEL: test2_mm_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
  return _mm_permute_ps(a, 0xe6);
 }

 __m256 test_mm256_permute_ps(__m256 A) {
  // CHECK-LABEL: test_mm256_permute_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  return _mm256_permute_ps(A, 0x1b);
 }

@ -1177,7 +1177,7 @@ void test_mm256_storeu2_m128(float* A, float* B, __m256 C) {
  // CHECK-LABEL: test_mm256_storeu2_m128
  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
  _mm256_storeu2_m128(A, B, C);
 }
@ -1186,7 +1186,7 @@ void test_mm256_storeu2_m128d(double* A, double* B, __m256d C) {
  // CHECK-LABEL: test_mm256_storeu2_m128d
  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
  _mm256_storeu2_m128d(A, B, C);
 }
@ -1195,7 +1195,7 @@ void test_mm256_storeu2_m128i(__m128i* A, __m128i* B, __m256i C) {
  // CHECK-LABEL: test_mm256_storeu2_m128i
  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
  _mm256_storeu2_m128i(A, B, C);
 }
--- a/clang/test/CodeGen/avx2-builtins.c
+++ b/clang/test/CodeGen/avx2-builtins.c
@ -370,20 +370,20 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {

 __m128i test0_mm256_extracti128_si256_0(__m256i a) {
  // CHECK-LABEL: test0_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  return _mm256_extracti128_si256(a, 0);
 }

 __m128i test1_mm256_extracti128_si256_1(__m256i a) {
  // CHECK-LABEL: test1_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  return _mm256_extracti128_si256(a, 1);
 }

 // Immediate should be truncated to one bit.
 __m128i test2_mm256_extracti128_si256(__m256i a) {
  // CHECK-LABEL: test2_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  return _mm256_extracti128_si256(a, 2);
 }

@ -891,13 +891,13 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {

 __m256i test_mm256_permute4x64_epi64(__m256i a) {
  // CHECK-LABEL: test_mm256_permute4x64_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  return _mm256_permute4x64_epi64(a, 35);
 }

 __m256d test_mm256_permute4x64_pd(__m256d a) {
  // CHECK-LABEL: test_mm256_permute4x64_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  return _mm256_permute4x64_pd(a, 25);
 }

--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@ -3409,40 +3409,40 @@ __m512i test_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 _

 __m512d test_mm512_permute_pd(__m512d __X) {
  // CHECK-LABEL: @test_mm512_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  return _mm512_permute_pd(__X, 2);
 }

 __m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
  // CHECK-LABEL: @test_mm512_mask_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_permute_pd(__W, __U, __X, 2);
 }

 __m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
  // CHECK-LABEL: @test_mm512_maskz_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_permute_pd(__U, __X, 2);
 }

 __m512 test_mm512_permute_ps(__m512 __X) {
  // CHECK-LABEL: @test_mm512_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  return _mm512_permute_ps(__X, 2);
 }

 __m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
  // CHECK-LABEL: @test_mm512_mask_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_mask_permute_ps(__W, __U, __X, 2);
 }

 __m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
  // CHECK-LABEL: @test_mm512_maskz_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_maskz_permute_ps(__U, __X, 2);
 }
--- a/clang/test/CodeGen/avx512vl-builtins.c
+++ b/clang/test/CodeGen/avx512vl-builtins.c
@ -4615,56 +4615,56 @@ __m256 test_mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) {

 __m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) {
  // CHECK-LABEL: @test_mm_mask_permute_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm_mask_permute_pd(__W, __U, __X, 1); 
 }

 __m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) {
  // CHECK-LABEL: @test_mm_maskz_permute_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm_maskz_permute_pd(__U, __X, 1); 
 }

 __m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) {
  // CHECK-LABEL: @test_mm256_mask_permute_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_mask_permute_pd(__W, __U, __X, 5); 
 }

 __m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) {
  // CHECK-LABEL: @test_mm256_maskz_permute_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
  return _mm256_maskz_permute_pd(__U, __X, 5); 
 }

 __m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) {
  // CHECK-LABEL: @test_mm_mask_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm_mask_permute_ps(__W, __U, __X, 0x1b); 
 }

 __m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) {
  // CHECK-LABEL: @test_mm_maskz_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm_maskz_permute_ps(__U, __X, 0x1b); 
 }

 __m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) {
  // CHECK-LABEL: @test_mm256_mask_permute_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_mask_permute_ps(__W, __U, __X, 0x1b); 
 }

 __m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {
  // CHECK-LABEL: @test_mm256_maskz_permute_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm256_maskz_permute_ps(__U, __X, 0x1b); 
 }