[X86] Add parentheses around casts in some of the X86 intrinsic headers.

This covers the SSE and AVX/AVX2 headers. The AVX512 headers are not covered here: they have many more macros because of the rounding-mode variants.

Fixes part of PR51324.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D107843
2021-08-13 09:22:43 -07:00 · 2021-08-13 09:22:43 -07:00 · 4190d99dfc
parent 17bc82dd3b
commit 4190d99dfc
8 changed files with 316 additions and 308 deletions
--- a/clang/lib/Headers/__wmmintrin_aes.h
+++ b/clang/lib/Headers/__wmmintrin_aes.h
@ -133,7 +133,7 @@ _mm_aesimc_si128(__m128i __V)
 ///    An 8-bit round constant used to generate the AES encryption key.
 /// \returns A 128-bit round key for AES encryption.
 #define _mm_aeskeygenassist_si128(C, R) \
-  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))

 #undef __DEFAULT_FN_ATTRS

--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@ -20,8 +20,8 @@

 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 #define _mm256_mpsadbw_epu8(X, Y, M) \
-  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
-                                     (__v32qi)(__m256i)(Y), (int)(M))
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                      (__v32qi)(__m256i)(Y), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi8(__m256i __a)
@ -114,8 +114,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
 }

 #define _mm256_alignr_epi8(a, b, n) \
-  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-                                     (__v32qi)(__m256i)(b), (n))
+  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                      (__v32qi)(__m256i)(b), (n)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_and_si256(__m256i __a, __m256i __b)
@ -149,8 +149,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 }

 #define _mm256_blend_epi16(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
-                                     (__v16hi)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+                                      (__v16hi)(__m256i)(V2), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@ -467,13 +467,13 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 }

 #define _mm256_shuffle_epi32(a, imm) \
-  (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

 #define _mm256_shufflehi_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

 #define _mm256_shufflelo_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi8(__m256i __a, __m256i __b)
@ -494,10 +494,10 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 }

 #define _mm256_slli_si256(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

 #define _mm256_bslli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi16(__m256i __a, int __count)
@ -560,10 +560,10 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 }

 #define _mm256_srli_si256(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

 #define _mm256_bsrli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi16(__m256i __a, int __count)
@ -743,12 +743,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)

 #define _mm_blend_epi32(V1, V2, M) \
-  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
-                                     (__v4si)(__m128i)(V2), (int)(M))
+  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+                                      (__v4si)(__m128i)(V2), (int)(M)))

 #define _mm256_blend_epi32(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
-                                     (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+                                      (__v8si)(__m256i)(V2), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastb_epi8(__m128i __X)
@ -806,7 +806,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 }

 #define _mm256_permute4x64_pd(V, M) \
-  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
+  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))

 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@ -815,17 +815,17 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 }

 #define _mm256_permute4x64_epi64(V, M) \
-  (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
+  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))

 #define _mm256_permute2x128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))

 #define _mm256_extracti128_si256(V, M) \
-  (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
+  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))

 #define _mm256_inserti128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
-                                        (__v2di)(__m128i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+                                         (__v2di)(__m128i)(V2), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskload_epi32(int const *__X, __m256i __M)
@ -936,211 +936,211 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 }

 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
-                                     (double const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s))
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))

 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
-                                        (double const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4df)(__m256d)(mask), (s))
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))

 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
-                                     (double const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s))
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))

 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
-                                        (double const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4df)(__m256d)(mask), (s))
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))

 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
-                                    (float const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4sf)(__m128)(mask), (s)))

 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
-                                       (float const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8sf)(__m256)(mask), (s))
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)(__m256)(mask), (s)))

 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
-                                    (float const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4sf)(__m128)(mask), (s)))

 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
-                                       (float const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)(__m128)(mask), (s)))

 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
-                                    (int const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4si)(__m128i)(mask), (s)))

 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
-                                       (int const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8si)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                        (int const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8si)(__m256i)(mask), (s)))

 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
-                                    (int const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4si)(__m128i)(mask), (s)))

 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
-                                       (int const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                        (int const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4si)(__m128i)(mask), (s)))

 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
-                                    (long long const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))

 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
-                                       (long long const *)(m), \
-                                       (__v4si)(__m128i)(i), \
-                                       (__v4di)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))

 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
-                                    (long long const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))

 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
-                                       (long long const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4di)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))

 #define _mm_i32gather_pd(m, i, s) \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
-                                     (double const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                          _mm_setzero_pd()), \
-                                     (s))
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))

 #define _mm256_i32gather_pd(m, i, s) \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
-                                        (double const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                              _mm256_setzero_pd(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s))
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))

 #define _mm_i64gather_pd(m, i, s) \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
-                                     (double const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                          _mm_setzero_pd()), \
-                                     (s))
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))

 #define _mm256_i64gather_pd(m, i, s) \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
-                                        (double const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                              _mm256_setzero_pd(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s))
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))

 #define _mm_i32gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
-                                    (float const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                         _mm_setzero_ps()), \
-                                    (s))
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))

 #define _mm256_i32gather_ps(m, i, s) \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
-                                       (float const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
-                                                             _mm256_setzero_ps(), \
-                                                             _CMP_EQ_OQ), \
-                                       (s))
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                              _mm256_setzero_ps(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)))

 #define _mm_i64gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
-                                    (float const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                         _mm_setzero_ps()), \
-                                    (s))
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))

 #define _mm256_i64gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
-                                       (float const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                            _mm_setzero_ps()), \
-                                       (s))
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                             _mm_setzero_ps()), \
+                                        (s)))

 #define _mm_i32gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
-                                    (int const *)(m), (__v4si)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v4si)(__m128i)(i), \
+                                     (__v4si)_mm_set1_epi32(-1), (s)))

 #define _mm256_i32gather_epi32(m, i, s) \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
-                                       (int const *)(m), (__v8si)(__m256i)(i), \
-                                       (__v8si)_mm256_set1_epi32(-1), (s))
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                        (int const *)(m), (__v8si)(__m256i)(i), \
+                                        (__v8si)_mm256_set1_epi32(-1), (s)))

 #define _mm_i64gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
-                                    (int const *)(m), (__v2di)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v2di)(__m128i)(i), \
+                                     (__v4si)_mm_set1_epi32(-1), (s)))

 #define _mm256_i64gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
-                                       (int const *)(m), (__v4di)(__m256i)(i), \
-                                       (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                        (int const *)(m), (__v4di)(__m256i)(i), \
+                                        (__v4si)_mm_set1_epi32(-1), (s)))

 #define _mm_i32gather_epi64(m, i, s) \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
-                                    (long long const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s))
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))

 #define _mm256_i32gather_epi64(m, i, s) \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
-                                       (long long const *)(m), \
-                                       (__v4si)(__m128i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s))
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))

 #define _mm_i64gather_epi64(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
-                                    (long long const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))

 #define _mm256_i64gather_epi64(m, i, s) \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
-                                       (long long const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s))
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))

 #undef __DEFAULT_FN_ATTRS256
 #undef __DEFAULT_FN_ATTRS128
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@ -400,7 +400,7 @@ _mm256_rcp_ps(__m256 __a)
 ///      11: Truncated.
 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
 #define _mm256_round_pd(V, M) \
-    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
+  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

 /// Rounds the values stored in a 256-bit vector of [8 x float] as
 ///    specified by the byte operand. The source values are rounded to integer
@ -432,7 +432,7 @@ _mm256_rcp_ps(__m256 __a)
 ///      11: Truncated.
 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
 #define _mm256_round_ps(V, M) \
-  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
+  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

 /// Rounds up the values stored in a 256-bit vector of [4 x double]. The
 ///    source values are rounded up to integer values and returned as 64-bit
@ -989,7 +989,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///         returned vector.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_permute_pd(A, C) \
-  (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
+  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))

 /// Copies the values in a 256-bit vector of [4 x double] as specified by
 ///    the immediate integer operand.
@ -1029,7 +1029,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///         returned vector.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute_pd(A, C) \
-  (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
+  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

 /// Copies the values in a 128-bit vector of [4 x float] as specified by
 ///    the immediate integer operand.
@ -1085,7 +1085,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///          returned vector.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_permute_ps(A, C) \
-  (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
+  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

 /// Copies the values in a 256-bit vector of [8 x float] as specified by
 ///    the immediate integer operand.
@ -1177,7 +1177,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///          returned vector.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute_ps(A, C) \
-  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
+  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

 /// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [4 x double], as specified by the immediate integer operand.
@ -1217,8 +1217,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///          destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute2f128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
-                                           (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                            (__v4df)(__m256d)(V2), (int)(M)))

 /// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [8 x float], as specified by the immediate integer operand.
@ -1258,8 +1258,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute2f128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
-                                          (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                           (__v8sf)(__m256)(V2), (int)(M)))

 /// Permutes 128-bit data values stored in two 256-bit integer vectors,
 ///    as specified by the immediate integer operand.
@ -1298,8 +1298,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    destination.
 /// \returns A 256-bit integer vector containing the copied values.
 #define _mm256_permute2f128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
-                                           (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                            (__v8si)(__m256i)(V2), (int)(M)))

 /* Vector Blend */
 /// Merges 64-bit double-precision data values stored in either of the
@ -1327,8 +1327,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_blend_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
-                                     (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+                                      (__v4df)(__m256d)(V2), (int)(M)))

 /// Merges 32-bit single-precision data values stored in either of the
 ///    two 256-bit vectors of [8 x float], as specified by the immediate
@ -1355,8 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_blend_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
-                                    (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+                                     (__v8sf)(__m256)(V2), (int)(M)))

 /// Merges 64-bit double-precision data values stored in either of the
 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@ -1453,8 +1453,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    two parallel dot product computations.
 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
 #define _mm256_dp_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
-                                 (__v8sf)(__m256)(V2), (M))
+  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), (M)))

 /* Vector shuffle */
 /// Selects 8 float values from the 256-bit operands of [8 x float], as
@ -1507,8 +1507,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) \
-  (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
-                                   (__v8sf)(__m256)(b), (int)(mask))
+  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
+                                    (__v8sf)(__m256)(b), (int)(mask)))

 /// Selects four double-precision values from the 256-bit operands of
 ///    [4 x double], as specified by the immediate value operand.
@ -1553,8 +1553,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    destination.
 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
 #define _mm256_shuffle_pd(a, b, mask) \
-  (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
-                                    (__v4df)(__m256d)(b), (int)(mask))
+  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
+                                     (__v4df)(__m256d)(b), (int)(mask)))

 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@ -1647,8 +1647,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) \
-  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))

 /// Compares each of the corresponding values of two 128-bit vectors of
 ///    [4 x float], using the operation specified by the immediate integer
@ -1707,8 +1707,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) \
-  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))

 /// Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
@ -1767,8 +1767,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) \
-  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
-                                   (__v4df)(__m256d)(b), (c))
+  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                    (__v4df)(__m256d)(b), (c)))

 /// Compares each of the corresponding values of two 256-bit vectors of
 ///    [8 x float], using the operation specified by the immediate integer
@ -1827,8 +1827,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) \
-  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
-                                  (__v8sf)(__m256)(b), (c))
+  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                   (__v8sf)(__m256)(b), (c)))

 /// Compares each of the corresponding scalar double-precision values of
 ///    two 128-bit vectors of [2 x double], using the operation specified by the
@ -1886,8 +1886,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) \
-  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))

 /// Compares each of the corresponding scalar values of two 128-bit
 ///    vectors of [4 x float], using the operation specified by the immediate
@ -1945,8 +1945,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) \
-  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))

 /// Takes a [8 x i32] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@ -1964,7 +1964,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A 32-bit integer containing the extracted 32 bits of extended
 ///    packed data.
 #define _mm256_extract_epi32(X, N) \
-  (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
+  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

 /// Takes a [16 x i16] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@ -1982,8 +1982,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
 ///    packed data.
 #define _mm256_extract_epi16(X, N) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
-                                                    (int)(N))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
+                                                     (int)(N)))

 /// Takes a [32 x i8] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@ -2001,8 +2001,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
 ///    packed data.
 #define _mm256_extract_epi8(X, N) \
-  (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
-                                                   (int)(N))
+  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
+                                                    (int)(N)))

 #ifdef __x86_64__
 /// Takes a [4 x i64] vector and returns the vector element value
@ -2021,7 +2021,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A 64-bit integer containing the extracted 64 bits of extended
 ///    packed data.
 #define _mm256_extract_epi64(X, N) \
-  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
+  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
 #endif

 /// Takes a [8 x i32] vector and replaces the vector element value
@ -2043,8 +2043,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi32(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
-                                       (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
+                                        (int)(I), (int)(N)))


 /// Takes a [16 x i16] vector and replaces the vector element value
@ -2066,8 +2066,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi16(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
+                                         (int)(I), (int)(N)))

 /// Takes a [32 x i8] vector and replaces the vector element value
 ///    indexed by the immediate constant operand with a new value. Returns the
@ -2088,8 +2088,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi8(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
+                                         (int)(I), (int)(N)))

 #ifdef __x86_64__
 /// Takes a [4 x i64] vector and replaces the vector element value
@ -2111,8 +2111,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///     \a __imm with \a __b.
 #define _mm256_insert_epi64(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
-                                       (long long)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
+                                        (long long)(I), (int)(N)))
 #endif

 /* Conversion */
@ -4592,8 +4592,8 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    result.
 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 #define _mm256_insertf128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
-                                           (__v4sf)(__m128)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+                                            (__v4sf)(__m128)(V2), (int)(M)))

 /// Constructs a new 256-bit vector of [4 x double] by first duplicating
 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
@ -4630,8 +4630,8 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    result.
 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 #define _mm256_insertf128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
-                                            (__v2df)(__m128d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+                                             (__v2df)(__m128d)(V2), (int)(M)))

 /// Constructs a new 256-bit integer vector by first duplicating a
 ///    256-bit integer vector given in the first parameter, and then replacing
@ -4668,8 +4668,8 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    result.
 /// \returns A 256-bit integer vector containing the interleaved values.
 #define _mm256_insertf128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
-                                            (__v4si)(__m128i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
+                                             (__v4si)(__m128i)(V2), (int)(M)))

 /*
   Vector extract.
@ -4698,7 +4698,7 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
 #define _mm256_extractf128_ps(V, M) \
-  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
+  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))

 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
 ///    of [4 x double], as determined by the immediate integer parameter, and
@ -4722,7 +4722,7 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
 #define _mm256_extractf128_pd(V, M) \
-  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
+  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))

 /// Extracts either the upper or the lower 128 bits from a 256-bit
 ///    integer vector, as determined by the immediate integer parameter, and
@ -4746,7 +4746,7 @@ _mm256_zextsi128_si256(__m128i __a)
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit integer vector containing the extracted bits.
 #define _mm256_extractf128_si256(V, M) \
-  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
+  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))

 /* SIMD load ops (unaligned) */
 /// Loads two 128-bit floating-point vectors of [4 x float] from
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@ -2818,10 +2818,10 @@ _mm_xor_si128(__m128i __a, __m128i __b)
 ///    \a a.
 /// \returns A 128-bit integer vector containing the left-shifted value.
 #define _mm_slli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

 #define _mm_bslli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
@ -3035,10 +3035,10 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
 ///    \a a.
 /// \returns A 128-bit integer vector containing the right-shifted value.
 #define _mm_srli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

 #define _mm_bsrli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

 /// Right-shifts each of 16-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
@ -4356,8 +4356,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi16(a, imm) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
-                                                   (int)(imm))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
+                                                    (int)(imm)))

 /// Constructs a 128-bit integer vector by first making a copy of the
 ///    128-bit integer vector parameter, and then inserting the lower 16 bits
@ -4380,8 +4380,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
 ///    lower 16 bits of \a __b are written.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi16(a, b, imm) \
-  (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
-                                       (int)(imm))
+  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
+                                        (int)(imm)))

 /// Copies the values of the most significant bits from each 8-bit
 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@ -4430,7 +4430,7 @@ _mm_movemask_epi8(__m128i __a)
 ///    11: assign values from bits [127:96] of \a a.
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm) \
-  (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4460,7 +4460,7 @@ _mm_movemask_epi8(__m128i __a)
 ///    11: assign values from bits [63:48] of \a a. \n
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4490,7 +4490,7 @@ _mm_movemask_epi8(__m128i __a)
 ///    11: assign values from bits [127:112] of \a a. \n
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4844,8 +4844,8 @@ _mm_movemask_pd(__m128d __a)
 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i) \
-  (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
-                                 (int)(i))
+  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                  (int)(i)))

 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
 ///    floating-point vector of [4 x float].
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@ -231,7 +231,7 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_round_ps(X, M) \
-  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
+  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

 /// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
@ -272,8 +272,8 @@
 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 ///    values.
 #define _mm_round_ss(X, Y, M) \
-  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
-                                 (__v4sf)(__m128)(Y), (M))
+  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                  (__v4sf)(__m128)(Y), (M)))

 /// Rounds each element of the 128-bit vector of [2 x double] to an
 ///    integer value according to the rounding control specified by the second
@ -306,7 +306,7 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_round_pd(X, M) \
-  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
+  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

 /// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
@ -347,8 +347,8 @@
 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 ///    values.
 #define _mm_round_sd(X, Y, M) \
-  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
-                                  (__v2df)(__m128d)(Y), (M))
+  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                   (__v2df)(__m128d)(Y), (M)))

 /* SSE4 Packed Blending Intrinsics.  */
 /// Returns a 128-bit vector of [2 x double] where the values are
@ -376,8 +376,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_blend_pd(V1, V2, M) \
-  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
-                                    (__v2df)(__m128d)(V2), (int)(M))
+  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                     (__v2df)(__m128d)(V2), (int)(M)))

 /// Returns a 128-bit vector of [4 x float] where the values are selected
 ///    from either the first or second operand as specified by the third
@ -404,8 +404,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_blend_ps(V1, V2, M) \
-  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
-                                   (__v4sf)(__m128)(V2), (int)(M))
+  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                    (__v4sf)(__m128)(V2), (int)(M)))

 /// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
@ -513,8 +513,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
 #define _mm_blend_epi16(V1, V2, M) \
-  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
-                                       (__v8hi)(__m128i)(V2), (int)(M))
+  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                        (__v8hi)(__m128i)(V2), (int)(M)))

 /* SSE4 Dword Multiply Instructions.  */
 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@ -590,8 +590,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    in the corresponding element; otherwise that element is set to zero.
 /// \returns A 128-bit vector of [4 x float] containing the dot product.
 #define _mm_dp_ps(X, Y, M) \
-  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
-                               (__v4sf)(__m128)(Y), (M))
+  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                                (__v4sf)(__m128)(Y), (M)))

 /// Computes the dot product of the two 128-bit vectors of [2 x double]
 ///    and returns it in the elements of the 128-bit result vector of
@ -625,8 +625,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
 ///    the corresponding element; otherwise that element is set to zero.
 #define _mm_dp_pd(X, Y, M) \
-  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
-                                (__v2df)(__m128d)(Y), (M))
+  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                 (__v2df)(__m128d)(Y), (M)))

 /* SSE4 Streaming Load Hint Instruction.  */
 /// Loads integer values from a 128-bit aligned memory location to a
@ -925,8 +925,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    1111: Bits [127:120] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi8(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+                                         (int)(I), (int)(N)))

 /// Constructs a 128-bit vector of [4 x i32] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 32-bit
@ -957,8 +957,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    11: Bits [127:96] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi32(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
-                                       (int)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+                                        (int)(I), (int)(N)))

 #ifdef __x86_64__
 /// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -988,8 +988,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    1: Bits [127:64] of the result are used for insertion. \n
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi64(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
-                                       (long long)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+                                        (long long)(I), (int)(N)))
 #endif /* __x86_64__ */

 /* Extract int from packed integer array at index.  This returns the element
@ -1031,8 +1031,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    128-bit integer vector parameter and the remaining bits are assigned
 ///    zeros.
 #define _mm_extract_epi8(X, N) \
-  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
-                                                   (int)(N))
+  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+                                                    (int)(N)))

 /// Extracts a 32-bit element from the 128-bit integer vector of
 ///    [4 x i32], using the immediate value parameter \a N as a selector.
@ -1057,7 +1057,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /// \returns  An integer, whose lower 32 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi32(X, N) \
-  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
+  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))

 #ifdef __x86_64__
 /// Extracts a 64-bit element from the 128-bit integer vector of
@ -1080,7 +1080,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    1: Bits [127:64] are returned. \n
 /// \returns  A 64-bit integer.
 #define _mm_extract_epi64(X, N) \
-  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
+  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
 #endif /* __x86_64 */

 /* SSE4 128-bit Packed Integer Comparisons.  */
@ -1514,8 +1514,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 /// \returns A 128-bit integer vector containing the sums of the sets of
 ///    absolute differences between both operands.
 #define _mm_mpsadbw_epu8(X, Y, M) \
-  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (M))
+  ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                       (__v16qi)(__m128i)(Y), (M)))

 /// Finds the minimum unsigned 16-bit element in the input 128-bit
 ///    vector of [8 x u16] and returns it and along with its index.
@ -1624,8 +1624,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns a 128-bit integer vector representing the result mask of
 ///    the comparison.
 #define _mm_cmpistrm(A, B, M) \
-  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
-                                       (__v16qi)(__m128i)(B), (int)(M))
+  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                        (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@ -1678,8 +1678,8 @@ _mm_minpos_epu16(__m128i __V)
 ///      1: The index of the most significant set bit. \n
 /// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpistri(A, B, M) \
-  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
-                                   (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -1738,9 +1738,9 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns a 128-bit integer vector representing the result mask of
 ///    the comparison.
 #define _mm_cmpestrm(A, LA, B, LB, M) \
-  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
-                                       (__v16qi)(__m128i)(B), (int)(LB), \
-                                       (int)(M))
+  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                        (__v16qi)(__m128i)(B), (int)(LB), \
+                                        (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -1797,9 +1797,9 @@ _mm_minpos_epu16(__m128i __V)
 ///      1: The index of the most significant set bit. \n
 /// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpestri(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
-                                   (__v16qi)(__m128i)(B), (int)(LB), \
-                                   (int)(M))
+  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M)))

 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
 /// Uses the immediate operand \a M to perform a comparison of string
@ -1849,8 +1849,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the bit mask is zero and the length of the string in
 ///    \a B is the maximum; otherwise, returns 0.
 #define _mm_cmpistra(A, B, M) \
-  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@ -1898,8 +1898,8 @@ _mm_minpos_epu16(__m128i __V)
 ///          to the size of \a A or \a B.
 /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
 #define _mm_cmpistrc(A, B, M) \
-  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@ -1946,8 +1946,8 @@ _mm_minpos_epu16(__m128i __V)
 ///          to the size of \a A or \a B. \n
 /// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpistro(A, B, M) \
-  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@ -1996,8 +1996,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the length of the string in \a A is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpistrs(A, B, M) \
-  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@ -2046,8 +2046,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the length of the string in \a B is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpistrz(A, B, M) \
-  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -2100,9 +2100,9 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the bit mask is zero and the length of the string in
 ///    \a B is the maximum, otherwise, returns 0.
 #define _mm_cmpestra(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -2154,9 +2154,9 @@ _mm_minpos_epu16(__m128i __V)
 ///          to the size of \a A or \a B. \n
 /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
 #define _mm_cmpestrc(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -2207,9 +2207,9 @@ _mm_minpos_epu16(__m128i __V)
 ///          to the size of \a A or \a B.
 /// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpestro(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -2262,9 +2262,9 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the length of the string in \a A is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpestrs(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))

 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@ -2316,9 +2316,9 @@ _mm_minpos_epu16(__m128i __V)
 /// \returns Returns 1 if the length of the string in \a B is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpestrz(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))

 /* SSE4.2 Compare Packed Data -- Greater Than.  */
 /// Compares each of the corresponding 64-bit values of the 128-bit
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@ -145,8 +145,8 @@ _mm_abs_epi32(__m128i __a)
 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_epi8(a, b, n) \
-  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
-                                     (__v16qi)(__m128i)(b), (n))
+  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                      (__v16qi)(__m128i)(b), (n)))

 /// Concatenates the two 64-bit integer vector operands, and right-shifts
 ///    the result by the number of bytes specified in the immediate operand.
@ -168,7 +168,7 @@ _mm_abs_epi32(__m128i __a)
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
+  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))

 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@ -2181,7 +2181,7 @@ void _mm_sfence(void);
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
+  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))

 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2212,7 +2212,7 @@ void _mm_sfence(void);
 /// \returns A 64-bit integer vector containing the copied packed data from the
 ///    operands.
 #define _mm_insert_pi16(a, d, n) \
-  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
+  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))

 /// Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the greater value to the
@ -2359,7 +2359,7 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    11: assigned from bits [63:48] of \a a.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
-  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
+  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))

 /// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
@ -2601,8 +2601,8 @@ void _mm_setcsr(unsigned int __i);
 ///    11: Bits [127:96] copied from the specified operand.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) \
-  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                (int)(mask))
+  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                 (int)(mask)))

 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@ -393,3 +393,11 @@ int test_mm_testz_si128(__m128i x, __m128i y) {
  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
  return _mm_testz_si128(x, y);
 }
+
+// Make sure subscripting works directly on the result of a macro intrinsic.
+float pr51324(__m128 a) {
+  // CHECK-LABEL: pr51324
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _mm_round_ps(a, 0)[0];
+}