[X86] Add parentheses around casts in some of the X86 intrinsic headers.

This covers the SSE and AVX/AVX2 headers. The AVX512 headers are not
covered here; they have a lot more macros due to the rounding-mode variants.

Fixes part of PR51324.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D107843
This commit is contained in:
Craig Topper 2021-08-13 09:22:43 -07:00
parent 17bc82dd3b
commit 4190d99dfc
8 changed files with 316 additions and 308 deletions

View File

@ -133,7 +133,7 @@ _mm_aesimc_si128(__m128i __V)
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS

View File

@ -20,8 +20,8 @@
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
(__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M))
((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
@ -114,8 +114,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
}
#define _mm256_alignr_epi8(a, b, n) \
(__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n))
((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
@ -149,8 +149,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
}
#define _mm256_blend_epi16(V1, V2, M) \
(__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@ -467,13 +467,13 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
}
#define _mm256_shuffle_epi32(a, imm) \
(__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
#define _mm256_shufflehi_epi16(a, imm) \
(__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
#define _mm256_shufflelo_epi16(a, imm) \
(__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi8(__m256i __a, __m256i __b)
@ -494,10 +494,10 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
}
#define _mm256_slli_si256(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
#define _mm256_bslli_epi128(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi16(__m256i __a, int __count)
@ -560,10 +560,10 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
}
/* Expands to __builtin_ia32_psrldqi256_byteshift applied to a (viewed as
 * __m256i) with imm converted to int; the result is cast to __m256i.
 * NOTE(review): unlike _mm256_slli_si256, the vector argument gets no
 * (__v4di) cast here -- presumably the builtin accepts the __m256i type
 * directly; confirm against the builtin's signature. */
#define _mm256_srli_si256(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
/* Same expansion as _mm256_srli_si256: calls
 * __builtin_ia32_psrldqi256_byteshift on a (viewed as __m256i) with imm
 * converted to int, and casts the result to __m256i. */
#define _mm256_bsrli_epi128(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi16(__m256i __a, int __count)
@ -743,12 +743,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
#define _mm_blend_epi32(V1, V2, M) \
(__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
#define _mm256_blend_epi32(V1, V2, M) \
(__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastb_epi8(__m128i __X)
@ -806,7 +806,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
}
#define _mm256_permute4x64_pd(V, M) \
(__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@ -815,17 +815,17 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
}
#define _mm256_permute4x64_epi64(V, M) \
(__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_permute2x128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
#define _mm256_extracti128_si256(V, M) \
(__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_inserti128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi32(int const *__X, __m256i __M)
@ -936,211 +936,211 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
}
/* Expands to __builtin_ia32_gatherd_pd with m viewed as double const *,
 * i as __v4si, mask as __v2df, and s forwarded unchanged; the result is
 * cast to __m128d.
 * NOTE(review): a is routed through (__m128i) before the (__v2df) cast,
 * whereas _mm_mask_i64gather_pd uses (__m128d) for the same operand --
 * looks like a typo in the intermediate cast; confirm before changing. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)(__m256d)(mask), (s))
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)(__m256d)(mask), (s))
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s)))
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)(__m256)(mask), (s))
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)(__m256)(mask), (s)))
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s)))
/* Expands to __builtin_ia32_gatherq_ps256. Despite the _mm256 prefix, only
 * the index operand i is 256 bits wide (__m256i viewed as __v4di); a, mask
 * and the result are all __m128 (viewed as __v4sf for the call). m is
 * viewed as float const * and s is forwarded unchanged. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)(__m128)(mask), (s))
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)(__m128)(mask), (s)))
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4si)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4si)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
(int const *)(m), \
(__v8si)(__m256i)(i), \
(__v8si)(__m256i)(mask), (s))
((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
(int const *)(m), \
(__v8si)(__m256i)(i), \
(__v8si)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v2di)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v2di)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s)))
/* Expands to __builtin_ia32_gatherq_d256. Despite the _mm256 prefix, only
 * the index operand i is 256 bits wide (__m256i viewed as __v4di); a, mask
 * and the result are all __m128i (viewed as __v4si for the call). m is
 * viewed as int const * and s is forwarded unchanged. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4di)(__m256i)(i), \
(__v4si)(__m128i)(mask), (s))
((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4di)(__m256i)(i), \
(__v4si)(__m128i)(mask), (s)))
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)(__m256i)(mask), (s))
((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)(__m256i)(mask), (s))
((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)(__m256i)(mask), (s)))
#define _mm_i32gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s)))
#define _mm256_i32gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s)))
#define _mm_i64gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s)))
#define _mm256_i64gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s)))
#define _mm_i32gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))
#define _mm256_i32gather_ps(m, i, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
_mm256_setzero_ps(), \
_CMP_EQ_OQ), \
(s))
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
_mm256_setzero_ps(), \
_CMP_EQ_OQ), \
(s)))
#define _mm_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))
/* Unmasked form built on __builtin_ia32_gatherq_ps256: the source operand
 * is _mm_undefined_ps() and the mask operand is
 * _mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()) -- presumably a mask with
 * every lane selected; confirm against _mm_cmpeq_ps. Only i is 256 bits
 * wide (__m256i viewed as __v4di); the result is __m128. m is viewed as
 * float const * and s is forwarded unchanged. */
#define _mm256_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))
#define _mm_i32gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4si)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4si)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm256_i32gather_epi32(m, i, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
(int const *)(m), (__v8si)(__m256i)(i), \
(__v8si)_mm256_set1_epi32(-1), (s))
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
(int const *)(m), (__v8si)(__m256i)(i), \
(__v8si)_mm256_set1_epi32(-1), (s)))
#define _mm_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v2di)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v2di)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))
/* Unmasked form built on __builtin_ia32_gatherq_d256: the source operand is
 * _mm_undefined_si128() and the mask operand is _mm_set1_epi32(-1). Only i
 * is 256 bits wide (__m256i viewed as __v4di); the result is __m128i. m is
 * viewed as int const * and s is forwarded unchanged. */
#define _mm256_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4di)(__m256i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4di)(__m256i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm_i32gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i32gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s)))
#define _mm_i64gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i64gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s)))
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128

View File

@ -400,7 +400,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
/// Rounds the values stored in a 256-bit vector of [8 x float] as
/// specified by the byte operand. The source values are rounded to integer
@ -432,7 +432,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
/// source values are rounded up to integer values and returned as 64-bit
@ -989,7 +989,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
(__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand.
@ -1029,7 +1029,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
(__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
/// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand.
@ -1085,7 +1085,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
(__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand.
@ -1177,7 +1177,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
(__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand.
@ -1217,8 +1217,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [8 x float], as specified by the immediate integer operand.
@ -1258,8 +1258,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
/// as specified by the immediate integer operand.
@ -1298,8 +1298,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
@ -1327,8 +1327,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
(__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
@ -1355,8 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
(__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@ -1453,8 +1453,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M))
((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M)))
/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
@ -1507,8 +1507,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
(__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask))
((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask)))
/// Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand.
@ -1553,8 +1553,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
(__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask))
((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask)))
/* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
@ -1647,8 +1647,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
@ -1707,8 +1707,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
/// Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the
@ -1767,8 +1767,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c))
((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c)))
/// Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer
@ -1827,8 +1827,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c))
((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c)))
/// Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the
@ -1886,8 +1886,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate
@ -1945,8 +1945,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
/// Takes a [8 x i32] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1964,7 +1964,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 32 bits of extended
/// packed data.
#define _mm256_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
/// Takes a [16 x i16] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1982,8 +1982,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
/// packed data.
#define _mm256_extract_epi16(X, N) \
(int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N))
((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N)))
/// Takes a [32 x i8] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -2001,8 +2001,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
/// packed data.
#define _mm256_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
@ -2021,7 +2021,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 64-bit integer containing the extracted 64 bits of extended
/// packed data.
#define _mm256_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
/// Takes a [8 x i32] vector and replaces the vector element value
@ -2043,8 +2043,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi32(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [16 x i16] vector and replaces the vector element value
@ -2066,8 +2066,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi16(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [32 x i8] vector and replaces the vector element value
/// indexed by the immediate constant operand with a new value. Returns the
@ -2088,8 +2088,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi8(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
@ -2111,8 +2111,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi64(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N)))
#endif
/* Conversion */
@ -4592,8 +4592,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
@ -4630,8 +4630,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
@ -4668,8 +4668,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
/*
Vector extract.
@ -4698,7 +4698,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
(__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and
@ -4722,7 +4722,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
(__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and
@ -4746,7 +4746,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
(__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from

View File

@ -2818,10 +2818,10 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bslli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
@ -3035,10 +3035,10 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bsrli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Right-shifts each 16-bit value in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
@ -4356,8 +4356,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
(int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm))
((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm)))
/// Constructs a 128-bit integer vector by first making a copy of the
/// 128-bit integer vector parameter, and then inserting the lower 16 bits
@ -4380,8 +4380,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// lower 16 bits of \a __b are written.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
(__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm))
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm)))
/// Copies the values of the most significant bits from each 8-bit
/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@ -4430,7 +4430,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4460,7 +4460,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
(__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4490,7 +4490,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
(__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4844,8 +4844,8 @@ _mm_movemask_pd(__m128d __a)
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
(__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i))
((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i)))
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
/// floating-point vector of [4 x float].

View File

@ -231,7 +231,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
@ -272,8 +272,8 @@
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_round_ss(X, Y, M) \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
@ -306,7 +306,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
@ -347,8 +347,8 @@
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_round_sd(X, Y, M) \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
@ -376,8 +376,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
(__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
@ -404,8 +404,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
(__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
@ -513,8 +513,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
(__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M))
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M)))
/* SSE4 Dword Multiply Instructions. */
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
@ -590,8 +590,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
@ -625,8 +625,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) \
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
@ -925,8 +925,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N)))
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 32-bit
@ -957,8 +957,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -988,8 +988,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N)))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
@ -1031,8 +1031,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 128-bit integer vector parameter and the remaining bits are assigned
/// zeros.
#define _mm_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N)))
/// Extracts a 32-bit element from the 128-bit integer vector of
/// [4 x i32], using the immediate value parameter \a N as a selector.
@ -1057,7 +1057,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
@ -1080,7 +1080,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64__ */
/* SSE4 128-bit Packed Integer Comparisons. */
@ -1514,8 +1514,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M))
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)))
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it along with its index.
@ -1624,8 +1624,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns A 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpistrm(A, B, M) \
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1678,8 +1678,8 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1738,9 +1738,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns A 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1797,9 +1797,9 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
@ -1849,8 +1849,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1898,8 +1898,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1946,8 +1946,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1996,8 +1996,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -2046,8 +2046,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2100,9 +2100,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2154,9 +2154,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2207,9 +2207,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2262,9 +2262,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2316,9 +2316,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit

View File

@ -145,8 +145,8 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) \
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n))
((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)))
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
@ -168,7 +168,7 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].

View File

@ -2181,7 +2181,7 @@ void _mm_sfence(void);
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
(int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16 bits of an integer operand at the 16-bit offset
@ -2212,7 +2212,7 @@ void _mm_sfence(void);
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) \
(__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
/// Compares each of the corresponding packed 16-bit integer values of
/// the 64-bit integer vectors, and writes the greater value to the
@ -2359,7 +2359,7 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
@ -2601,8 +2601,8 @@ void _mm_setcsr(unsigned int __i);
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask))
((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask)))
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].

View File

@ -393,3 +393,11 @@ int test_mm_testz_si128(__m128i x, __m128i y) {
// CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_testz_si128(x, y);
}
// Make sure a subscript can be applied directly to the result of a macro
// intrinsic (PR51324). A postfix [0] binds tighter than a cast, so it only
// indexes the returned vector when the macro expansion is fully parenthesized.
float pr51324(__m128 a) {
// CHECK-LABEL: pr51324
// CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm_round_ps(a, 0)[0];
}