forked from OSchip/llvm-project
663 lines
22 KiB
C
663 lines
22 KiB
C
/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
|
|
*
|
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
* See https://llvm.org/LICENSE.txt for license information.
|
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
*
|
|
*===-----------------------------------------------------------------------===
|
|
*/
|
|
|
|
/* Implemented from the specification included in the Intel C++ Compiler
|
|
User Guide and Reference, version 9.0.
|
|
|
|
NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */
|
|
|
|
#ifndef NO_WARN_X86_INTRINSICS
|
|
/* This header is distributed to simplify porting x86_64 code that
|
|
makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
|
|
|
|
It is the user's responsibility to determine if the results are
|
|
acceptable and make additional changes as necessary.
|
|
|
|
Note that much code that uses Intel intrinsics can be rewritten in
|
|
standard C or GNU C extensions, which are more portable and better
|
|
optimized across multiple targets. */
|
|
#error \
|
|
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
|
|
#endif
|
|
|
|
#ifndef SMMINTRIN_H_
|
|
#define SMMINTRIN_H_
|
|
|
|
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
|
|
|
#include <altivec.h>
|
|
#include <tmmintrin.h>
|
|
|
|
/* Exception-control flags; OR one of these with a rounding mode below. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Rounding mode selectors. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Named combinations of a rounding mode and an exception-control flag. */
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
|
|
|
|
extern __inline __m128d
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_round_pd(__m128d __A, int __rounding) {
|
|
__v2df __r;
|
|
union {
|
|
double __fr;
|
|
long long __fpscr;
|
|
} __enables_save, __fpscr_save;
|
|
|
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
|
/* Save enabled exceptions, disable all exceptions,
|
|
and preserve the rounding mode. */
|
|
#ifdef _ARCH_PWR9
|
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
|
#else
|
|
__fpscr_save.__fr = __builtin_mffs();
|
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
|
__fpscr_save.__fpscr &= ~0xf8;
|
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
|
#endif
|
|
/* Insert an artificial "read/write" reference to the variable
|
|
read below, to ensure the compiler does not schedule
|
|
a read/use of the variable before the FPSCR is modified, above.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : "+wa"(__A));
|
|
}
|
|
|
|
switch (__rounding) {
|
|
case _MM_FROUND_TO_NEAREST_INT:
|
|
__fpscr_save.__fr = __builtin_mffsl();
|
|
__attribute__((fallthrough));
|
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
|
__builtin_set_fpscr_rn(0b00);
|
|
/* Insert an artificial "read/write" reference to the variable
|
|
read below, to ensure the compiler does not schedule
|
|
a read/use of the variable before the FPSCR is modified, above.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : "+wa"(__A));
|
|
|
|
__r = vec_rint((__v2df)__A);
|
|
|
|
/* Insert an artificial "read" reference to the variable written
|
|
above, to ensure the compiler does not schedule the computation
|
|
of the value after the manipulation of the FPSCR, below.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : : "wa"(__r));
|
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
|
break;
|
|
case _MM_FROUND_TO_NEG_INF:
|
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
|
__r = vec_floor((__v2df)__A);
|
|
break;
|
|
case _MM_FROUND_TO_POS_INF:
|
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
|
__r = vec_ceil((__v2df)__A);
|
|
break;
|
|
case _MM_FROUND_TO_ZERO:
|
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
|
__r = vec_trunc((__v2df)__A);
|
|
break;
|
|
case _MM_FROUND_CUR_DIRECTION:
|
|
__r = vec_rint((__v2df)__A);
|
|
break;
|
|
}
|
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
|
/* Insert an artificial "read" reference to the variable written
|
|
above, to ensure the compiler does not schedule the computation
|
|
of the value after the manipulation of the FPSCR, below.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : : "wa"(__r));
|
|
/* Restore enabled exceptions. */
|
|
__fpscr_save.__fr = __builtin_mffsl();
|
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
|
}
|
|
return (__m128d)__r;
|
|
}
|
|
|
|
extern __inline __m128d
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
|
|
__B = _mm_round_pd(__B, __rounding);
|
|
__v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
|
|
return (__m128d)__r;
|
|
}
|
|
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_round_ps(__m128 __A, int __rounding) {
|
|
__v4sf __r;
|
|
union {
|
|
double __fr;
|
|
long long __fpscr;
|
|
} __enables_save, __fpscr_save;
|
|
|
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
|
/* Save enabled exceptions, disable all exceptions,
|
|
and preserve the rounding mode. */
|
|
#ifdef _ARCH_PWR9
|
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
|
#else
|
|
__fpscr_save.__fr = __builtin_mffs();
|
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
|
__fpscr_save.__fpscr &= ~0xf8;
|
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
|
#endif
|
|
/* Insert an artificial "read/write" reference to the variable
|
|
read below, to ensure the compiler does not schedule
|
|
a read/use of the variable before the FPSCR is modified, above.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : "+wa"(__A));
|
|
}
|
|
|
|
switch (__rounding) {
|
|
case _MM_FROUND_TO_NEAREST_INT:
|
|
__fpscr_save.__fr = __builtin_mffsl();
|
|
__attribute__((fallthrough));
|
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
|
__builtin_set_fpscr_rn(0b00);
|
|
/* Insert an artificial "read/write" reference to the variable
|
|
read below, to ensure the compiler does not schedule
|
|
a read/use of the variable before the FPSCR is modified, above.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : "+wa"(__A));
|
|
|
|
__r = vec_rint((__v4sf)__A);
|
|
|
|
/* Insert an artificial "read" reference to the variable written
|
|
above, to ensure the compiler does not schedule the computation
|
|
of the value after the manipulation of the FPSCR, below.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : : "wa"(__r));
|
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
|
break;
|
|
case _MM_FROUND_TO_NEG_INF:
|
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
|
__r = vec_floor((__v4sf)__A);
|
|
break;
|
|
case _MM_FROUND_TO_POS_INF:
|
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
|
__r = vec_ceil((__v4sf)__A);
|
|
break;
|
|
case _MM_FROUND_TO_ZERO:
|
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
|
__r = vec_trunc((__v4sf)__A);
|
|
break;
|
|
case _MM_FROUND_CUR_DIRECTION:
|
|
__r = vec_rint((__v4sf)__A);
|
|
break;
|
|
}
|
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
|
/* Insert an artificial "read" reference to the variable written
|
|
above, to ensure the compiler does not schedule the computation
|
|
of the value after the manipulation of the FPSCR, below.
|
|
This can be removed if and when GCC PR102783 is fixed.
|
|
*/
|
|
__asm__("" : : "wa"(__r));
|
|
/* Restore enabled exceptions. */
|
|
__fpscr_save.__fr = __builtin_mffsl();
|
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
|
}
|
|
return (__m128)__r;
|
|
}
|
|
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
|
|
__B = _mm_round_ps(__B, __rounding);
|
|
__v4sf __r = (__v4sf)__A;
|
|
__r[0] = ((__v4sf)__B)[0];
|
|
return (__m128)__r;
|
|
}
|
|
|
|
/* Ceiling and floor wrappers over the _mm_round_* functions.
   Double precision first, then single precision. */
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
|
__v16qi __result = (__v16qi)__A;
|
|
|
|
__result[__N & 0xf] = __D;
|
|
|
|
return (__m128i)__result;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
|
__v4si __result = (__v4si)__A;
|
|
|
|
__result[__N & 3] = __D;
|
|
|
|
return (__m128i)__result;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
|
__v2di __result = (__v2di)__A;
|
|
|
|
__result[__N & 1] = __D;
|
|
|
|
return (__m128i)__result;
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_extract_epi8(__m128i __X, const int __N) {
|
|
return (unsigned char)((__v16qi)__X)[__N & 15];
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_extract_epi32(__m128i __X, const int __N) {
|
|
return ((__v4si)__X)[__N & 3];
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_extract_epi64(__m128i __X, const int __N) {
|
|
return ((__v2di)__X)[__N & 1];
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_extract_ps(__m128 __X, const int __N) {
|
|
return ((__v4si)__X)[__N & 3];
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
|
__v16qi __charmask = vec_splats((signed char)__imm8);
|
|
__charmask = vec_gb(__charmask);
|
|
__v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
|
|
#ifdef __BIG_ENDIAN__
|
|
__shortmask = vec_reve(__shortmask);
|
|
#endif
|
|
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
|
}
|
|
#endif
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
|
#ifdef _ARCH_PWR10
|
|
return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
|
|
#else
|
|
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
|
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
|
return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
|
|
__v16qu __pcv[] = {
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
|
};
|
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
|
return (__m128)__r;
|
|
}
|
|
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
|
|
#ifdef _ARCH_PWR10
|
|
return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
|
|
#else
|
|
const __v4si __zero = {0};
|
|
const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
|
|
return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m128d
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
|
|
__v16qu __pcv[] = {
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
|
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
|
return (__m128d)__r;
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128d
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
|
|
#ifdef _ARCH_PWR10
|
|
return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
|
|
#else
|
|
const __v2di __zero = {0};
|
|
const __vector __bool long long __boolmask =
|
|
vec_cmplt((__v2di)__mask, __zero);
|
|
return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_testz_si128(__m128i __A, __m128i __B) {
|
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
|
const __v16qu __zero = {0};
|
|
return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_testc_si128(__m128i __A, __m128i __B) {
|
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
|
const __v16qu __zero = {0};
|
|
const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
|
|
return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_testnzc_si128(__m128i __A, __m128i __B) {
|
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
|
return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
|
|
}
|
|
|
|
/* Convenience wrappers over the _mm_test*_si128 functions. */
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

/* Compare against a constant all-ones vector so that V is evaluated exactly
   once; the previous _mm_cmpeq_epi32((V), (V)) form expanded V three times,
   repeating any side effects in the argument.  */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
|
|
}
|
|
#endif
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_epi8(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_epu16(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_epi32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_epu32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_max_epi8(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_max_epu16(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_max_epi32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_max_epu32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_mul_epi32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
|
|
}
|
|
#endif
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi8_epi16(__m128i __A) {
|
|
return (__m128i)vec_unpackh((__v16qi)__A);
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi8_epi32(__m128i __A) {
|
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi8_epi64(__m128i __A) {
|
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
|
return (__m128i)vec_unpackh((__v4si)__A);
|
|
}
|
|
#endif
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi16_epi32(__m128i __A) {
|
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi16_epi64(__m128i __A) {
|
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
|
return (__m128i)vec_unpackh((__v4si)__A);
|
|
}
|
|
#endif
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepi32_epi64(__m128i __A) {
|
|
return (__m128i)vec_unpackh((__v4si)__A);
|
|
}
|
|
#endif
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu8_epi16(__m128i __A) {
|
|
const __v16qu __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu8_epi32(__m128i __A) {
|
|
const __v16qu __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu8_epi64(__m128i __A) {
|
|
const __v16qu __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu16_epi32(__m128i __A) {
|
|
const __v8hu __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu16_epi64(__m128i __A) {
|
|
const __v8hu __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cvtepu32_epi64(__m128i __A) {
|
|
const __v4su __zero = {0};
|
|
#ifdef __LITTLE_ENDIAN__
|
|
__A = (__m128i)vec_mergeh((__v4su)__A, __zero);
|
|
#else /* __BIG_ENDIAN__. */
|
|
__A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
|
|
#endif /* __BIG_ENDIAN__. */
|
|
return __A;
|
|
}
|
|
|
|
/* Return horizontal packed word minimum and its index in bits [15:0]
|
|
and bits [18:16] respectively. */
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_minpos_epu16(__m128i __A) {
|
|
union __u {
|
|
__m128i __m;
|
|
__v8hu __uh;
|
|
};
|
|
union __u __u = {.__m = __A}, __r = {.__m = {0}};
|
|
unsigned short __ridx = 0;
|
|
unsigned short __rmin = __u.__uh[__ridx];
|
|
unsigned long __i;
|
|
for (__i = 1; __i < 8; __i++) {
|
|
if (__u.__uh[__i] < __rmin) {
|
|
__rmin = __u.__uh[__i];
|
|
__ridx = __i;
|
|
}
|
|
}
|
|
__r.__uh[0] = __rmin;
|
|
__r.__uh[1] = __ridx;
|
|
return __r.__m;
|
|
}
|
|
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_packus_epi32(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
|
|
}
|
|
|
|
#ifdef _ARCH_PWR8
|
|
extern __inline __m128i
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
|
|
return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
|
|
}
|
|
#endif
|
|
|
|
#else
|
|
#include_next <smmintrin.h>
|
|
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
|
*/
|
|
|
|
#endif /* SMMINTRIN_H_ */
|