[NFC] Format and uglify PowerPC intrinsics headers

This change formats the PowerPC intrinsics wrapper headers into LLVM style
and adds the extra prefix '__' to all variables to prevent conflicts with
user code.
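
As an illustration of the conflict class the reserved prefix avoids (all names below are hypothetical, not taken from the headers): a user macro can silently rewrite any unprefixed local variable in a header-defined inline body, while identifiers beginning with '__' are reserved for the implementation and may not be macro-defined by conforming user code.

#include <stdio.h>

/* User code may legally define a macro named `result` before including a
   system header.  With the old spelling, a header-local variable called
   `result` would have been rewritten by this macro; the reserved spelling
   `__result` cannot collide. */
#define result some_user_symbol

/* Stand-in for a wrapper-header inline body (implementation side, so the
   reserved name is appropriate here). */
static inline unsigned count_bits(unsigned long __x) {
  unsigned long __result = __builtin_popcountl(__x);
  return (unsigned)__result;
}

int main(void) {
  printf("%u\n", count_bits(0xF0UL)); /* prints 4 */
  return 0;
}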
Qiu Chaofan 2022-03-24 20:46:35 +08:00
parent 028f9f5b2b
commit 895e5b2d80
10 changed files with 3549 additions and 3790 deletions


@@ -50,37 +50,37 @@ extern __inline unsigned long long
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __M) {
unsigned long result = 0x0UL;
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c, t;
unsigned long p;
unsigned long __result = 0x0UL;
const unsigned long __mask = 0x8000000000000000UL;
unsigned long __m = __M;
unsigned long __c, __t;
unsigned long __p;
/* The pop-count of the mask gives the number of the bits from
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
p = 64 - __builtin_popcountl(__M);
__p = 64 - __builtin_popcountl(__M);
/* The loop is for the number of '1' bits in the mask and clearing
each mask bit as it is processed. */
while (m != 0) {
c = __builtin_clzl(m);
t = __X << (p - c);
m ^= (mask >> c);
result |= (t & (mask >> c));
p++;
while (__m != 0) {
__c = __builtin_clzl(__m);
__t = __X << (__p - __c);
__m ^= (__mask >> __c);
__result |= (__t & (__mask >> __c));
__p++;
}
return (result);
return __result;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __M) {
unsigned long p = 0x4040404040404040UL; // initial bit permute control
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c;
unsigned long result;
unsigned long __p = 0x4040404040404040UL; // initial bit permute control
const unsigned long __mask = 0x8000000000000000UL;
unsigned long __m = __M;
unsigned long __c;
unsigned long __result;
/* if the mask is constant and selects 8 bits or less we can use
the Power8 Bit permute instruction. */
@@ -88,30 +88,31 @@ extern __inline unsigned long long
/* Also if the pext mask is constant, then the popcount is
constant, we can evaluate the following loop at compile
time and use a constant bit permute vector. */
for (long i = 0; i < __builtin_popcountl(__M); i++) {
c = __builtin_clzl(m);
p = (p << 8) | c;
m ^= (mask >> c);
long __i;
for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
__c = __builtin_clzl(__m);
__p = (__p << 8) | __c;
__m ^= (__mask >> __c);
}
result = __builtin_bpermd(p, __X);
__result = __builtin_bpermd(__p, __X);
} else {
p = 64 - __builtin_popcountl(__M);
result = 0;
__p = 64 - __builtin_popcountl(__M);
__result = 0;
/* We could use a for loop here, but that combined with
-funroll-loops can expand to a lot of code. The while
loop avoids unrolling and the compiler commons the xor
from clearing the mask bit with the (m != 0) test. The
result is a more compact loop setup and body. */
while (m != 0) {
unsigned long t;
c = __builtin_clzl(m);
t = (__X & (mask >> c)) >> (p - c);
m ^= (mask >> c);
result |= (t);
p++;
while (__m != 0) {
unsigned long __t;
__c = __builtin_clzl(__m);
__t = (__X & (__mask >> __c)) >> (__p - __c);
__m ^= (__mask >> __c);
__result |= (__t);
__p++;
}
}
return (result);
return __result;
}
/* these 32-bit implementations depend on 64-bit pdep/pext
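
For context, a plain-C model of what the two wrappers above compute (illustration only, not the wrapper's implementation, which uses the count-leading-zeros and bit-permute sequences shown in the diff): _pdep_u64 deposits the low bits of the source into the set-bit positions of the mask, and _pext_u64 gathers the masked bits back into the low end of the result.

#include <stdint.h>
#include <stdio.h>

/* Reference model: deposit bit i of x into the i-th lowest set bit of mask. */
static uint64_t ref_pdep(uint64_t x, uint64_t mask) {
  uint64_t result = 0;
  for (uint64_t bit = 1; mask != 0; bit <<= 1) {
    uint64_t lowest = mask & -mask; /* lowest set bit of the mask */
    if (x & bit)
      result |= lowest;
    mask ^= lowest;
  }
  return result;
}

/* Reference model: gather the bits of x selected by mask into the low bits. */
static uint64_t ref_pext(uint64_t x, uint64_t mask) {
  uint64_t result = 0;
  uint64_t bit = 1;
  for (; mask != 0; bit <<= 1) {
    uint64_t lowest = mask & -mask;
    if (x & lowest)
      result |= bit;
    mask ^= lowest;
  }
  return result;
}

int main(void) {
  /* Deposit the low four bits of 0xB into mask 0xF0, then extract them back. */
  printf("pdep(0xB, 0xF0)  = 0x%llx\n", (unsigned long long)ref_pdep(0xB, 0xF0));  /* 0xb0 */
  printf("pext(0xB0, 0xF0) = 0x%llx\n", (unsigned long long)ref_pext(0xB0, 0xF0)); /* 0xb */
  return 0;
}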

File diff suppressed because it is too large


@@ -17,31 +17,25 @@
/* We can't depend on <stdlib.h> since the prototype of posix_memalign
may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
extern int posix_memalign(void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t);
extern "C" int posix_memalign(void **, size_t, size_t);
#endif
static __inline void *
_mm_malloc (size_t size, size_t alignment)
{
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
/* PowerPC64 ELF V2 ABI requires quadword alignment. */
size_t vec_align = sizeof (__vector float);
void *ptr;
size_t __vec_align = sizeof(__vector float);
void *__ptr;
if (alignment < vec_align)
alignment = vec_align;
if (posix_memalign (&ptr, alignment, size) == 0)
return ptr;
if (__alignment < __vec_align)
__alignment = __vec_align;
if (posix_memalign(&__ptr, __alignment, __size) == 0)
return __ptr;
else
return NULL;
}
static __inline void
_mm_free (void * ptr)
{
free (ptr);
}
static __inline void _mm_free(void *__ptr) { free(__ptr); }
#else
#include_next <mm_malloc.h>
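
A brief usage sketch for the _mm_malloc/_mm_free pair above: even when the caller asks for a smaller alignment, the wrapper raises the request to the quadword (16-byte) minimum required by the PowerPC64 ELF V2 ABI.

#include <mm_malloc.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Ask for 8-byte alignment; the wrapper rounds the request up to 16. */
  void *p = _mm_malloc(64, 8);
  if (p == NULL)
    return 1;
  printf("quadword aligned: %s\n", ((uintptr_t)p % 16 == 0) ? "yes" : "no");
  _mm_free(p);
  return 0;
}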

File diff suppressed because it is too large


@@ -32,7 +32,8 @@
In the specific case of the monitor and mwait instructions there are
no direct equivalent in the PowerISA at this time. So those
intrinsics are not implemented. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
#ifndef PMMINTRIN_H_
@@ -43,106 +44,94 @@
/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
__v4sf even_neg_Y = vec_xor(__Y, even_n0);
return (__m128) vec_add (__X, even_neg_Y);
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps(__m128 __X, __m128 __Y) {
const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
__v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
return (__m128)vec_add(__X, __even_neg_Y);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
const __v2df even_n0 = {-0.0, 0.0};
__v2df even_neg_Y = vec_xor(__Y, even_n0);
return (__m128d) vec_add (__X, even_neg_Y);
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd(__m128d __X, __m128d __Y) {
const __v2df __even_n0 = {-0.0, 0.0};
__v2df __even_neg_Y = vec_xor(__Y, __even_n0);
return (__m128d)vec_add(__X, __even_neg_Y);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
0x00, 0x01, 0x02, 0x03,
0x08, 0x09, 0x0A, 0x0B,
0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
0x04, 0x05, 0x06, 0x07,
0x0C, 0x0D, 0x0E, 0x0F,
0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps(__m128 __X, __m128 __Y) {
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B};
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F};
return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
0x00, 0x01, 0x02, 0x03,
0x08, 0x09, 0x0A, 0x0B,
0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
0x04, 0x05, 0x06, 0x07,
0x0C, 0x0D, 0x0E, 0x0F,
0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps(__m128 __X, __m128 __Y) {
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B};
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F};
return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
vec_mergel ((__v2df) __X, (__v2df)__Y));
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd(__m128d __X, __m128d __Y) {
return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
vec_mergel((__v2df)__X, (__v2df)__Y));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
vec_mergel ((__v2df) __X, (__v2df)__Y));
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd(__m128d __X, __m128d __Y) {
return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
vec_mergel((__v2df)__X, (__v2df)__Y));
}
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps(__m128 __X) {
return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps(__m128 __X) {
return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
return (__m128d) vec_splats (*__P);
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd(double const *__P) {
return (__m128d)vec_splats(*__P);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd(__m128d __X) {
return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128(__m128i const *__P) {
return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
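
A small usage sketch for the add-subtract and horizontal-add wrappers above. Per the #error text earlier in this file, building against these wrappers needs -DNO_WARN_X86_INTRINSICS and a Power target; the exact triple and CPU below are illustrative assumptions.

/* Assumed build: clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 \
                        -DNO_WARN_X86_INTRINSICS addsub_demo.c */
#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1,2,3,4 */
  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* lanes 0..3 = 5,6,7,8 */

  /* Even lanes are subtracted, odd lanes added: {1-5, 2+6, 3-7, 4+8}. */
  __m128 s = _mm_addsub_ps(a, b);
  /* Pairwise sums of a, then of b: {1+2, 3+4, 5+6, 7+8}. */
  __m128 h = _mm_hadd_ps(a, b);

  float sf[4], hf[4];
  _mm_storeu_ps(sf, s);
  _mm_storeu_ps(hf, h);
  printf("addsub: %g %g %g %g\n", sf[0], sf[1], sf[2], sf[3]); /* -4 8 -4 12 */
  printf("hadd:   %g %g %g %g\n", hf[0], hf[1], hf[2], hf[3]); /*  3 7 11 15 */
  return 0;
}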

File diff suppressed because it is too large


@@ -33,471 +33,415 @@
#include <pmmintrin.h>
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
return (__m128i) vec_abs ((__v8hi) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16(__m128i __A) {
return (__m128i)vec_abs((__v8hi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
return (__m128i) vec_abs ((__v4si) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32(__m128i __A) {
return (__m128i)vec_abs((__v4si)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
return (__m128i) vec_abs ((__v16qi) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8(__m128i __A) {
return (__m128i)vec_abs((__v16qi)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
__v8hi __B = (__v8hi) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16(__m64 __A) {
__v8hi __B = (__v8hi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
__v4si __B = (__v4si) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32(__m64 __A) {
__v4si __B = (__v4si)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
__v16qi __B = (__v16qi) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8(__m64 __A) {
__v16qi __B = (__v16qi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
if (__builtin_constant_p (__count) && __count < 16)
{
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_reve ((__v16qu) __A);
__B = (__m128i) vec_reve ((__v16qu) __B);
__A = (__m128i)vec_reve((__v16qu)__A);
__B = (__m128i)vec_reve((__v16qu)__B);
#endif
__A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
__A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_reve ((__v16qu) __A);
__A = (__m128i)vec_reve((__v16qu)__A);
#endif
return __A;
}
return __A;
}
if (__count == 0)
return __B;
if (__count >= 16)
{
if (__count >= 32)
{
const __v16qu zero = { 0 };
return (__m128i) zero;
}
else
{
const __v16qu __shift =
vec_splats ((unsigned char) ((__count - 16) * 8));
if (__count >= 16) {
if (__count >= 32) {
const __v16qu __zero = {0};
return (__m128i)__zero;
} else {
const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
return (__m128i) vec_sro ((__v16qu) __A, __shift);
return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
return (__m128i) vec_slo ((__v16qu) __A, __shift);
return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
}
}
else
{
const __v16qu __shiftA =
vec_splats ((unsigned char) ((16 - __count) * 8));
const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
} else {
const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
__B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
__A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
__B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
__A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
__B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
__A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
__B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
}
return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
}
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
if (__count < 16)
{
__v2du __C = { __B, __A };
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
if (__count < 16) {
__v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
const __v4su __shift = { __count << 3, 0, 0, 0 };
__C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
const __v4su __shift = {__count << 3, 0, 0, 0};
__C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
const __v4su __shift = { 0, 0, 0, __count << 3 };
__C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
const __v4su __shift = {0, 0, 0, __count << 3};
__C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
return (__m64) __C[0];
}
else
{
const __m64 __zero = { 0 };
return __zero;
}
return (__m64)__C[0];
} else {
const __m64 __zero = {0};
return __zero;
}
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_add (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
return (__m128i) vec_add (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_add (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
__v4si __C = (__v4si) (__v2du) { __A, __B };
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
__v4si __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_add (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
__v4si __C = { 0 }, __D = { 0 };
__C = vec_sum4s ((__v8hi) __A, __C);
__D = vec_sum4s ((__v8hi) __B, __D);
__C = (__v4si) vec_packs (__C, __D);
return (__m128i) __C;
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16(__m128i __A, __m128i __B) {
__v4si __C = {0}, __D = {0};
__C = vec_sum4s((__v8hi)__A, __C);
__D = vec_sum4s((__v8hi)__B, __D);
__C = (__v4si)vec_packs(__C, __D);
return (__m128i)__C;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
const __v4si __zero = { 0 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v4si __D = vec_sum4s (__C, __zero);
__C = vec_packs (__D, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16(__m64 __A, __m64 __B) {
const __v4si __zero = {0};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v4si __D = vec_sum4s(__C, __zero);
__C = vec_packs(__D, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_sub (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
return (__m128i) vec_sub (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v8hi __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_sub (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
__v4si __C = (__v4si) (__v2du) { __A, __B };
__v4si __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_sub (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __C = (__v4si)(__v2du){__A, __B};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_subs (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_subs(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v8hi __D = vec_perm (__C, __C, __P);
__v8hi __E = vec_perm (__C, __C, __Q);
__C = vec_subs (__D, __E);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __P);
__v8hi __E = vec_perm(__C, __C, __Q);
__C = vec_subs(__D, __E);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
const __v16qi __zero = { 0 };
__vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
__v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
return (__m128i) vec_sel (__C, __zero, __select);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
__v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
return (__m128i)vec_sel(__C, __zero, __select);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
const __v16qi __zero = { 0 };
__v16qi __C = (__v16qi) (__v2du) { __A, __A };
__v16qi __D = (__v16qi) (__v2du) { __B, __B };
__vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
__C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
__C = vec_sel (__C, __zero, __select);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8(__m64 __A, __m64 __B) {
const __v16qi __zero = {0};
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
__C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
__C = vec_sel(__C, __zero, __select);
return (__m64)((__v2du)(__C))[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
const __v16qi __zero = { 0 };
__v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
__v16qi __selectpos =
(__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
(__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
__v16qi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
const __v8hi __zero = { 0 };
__v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
__v8hi __selectpos =
(__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16(__m128i __A, __m128i __B) {
const __v8hi __zero = {0};
__v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
__v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
__v8hi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
const __v4si __zero = { 0 };
__v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
__v4si __selectpos =
(__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32(__m128i __A, __m128i __B) {
const __v4si __zero = {0};
__v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
__v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
__v4si __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
const __v16qi __zero = { 0 };
__v16qi __C = (__v16qi) (__v2du) { __A, __A };
__v16qi __D = (__v16qi) (__v2du) { __B, __B };
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8(__m64 __A, __m64 __B) {
const __v16qi __zero = {0};
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
const __v8hi __zero = { 0 };
__v8hi __C = (__v8hi) (__v2du) { __A, __A };
__v8hi __D = (__v8hi) (__v2du) { __B, __B };
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16(__m64 __A, __m64 __B) {
const __v8hi __zero = {0};
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
const __v4si __zero = { 0 };
__v4si __C = (__v4si) (__v2du) { __A, __A };
__v4si __D = (__v4si) (__v2du) { __B, __B };
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32(__m64 __A, __m64 __B) {
const __v4si __zero = {0};
__v4si __C = (__v4si)(__v2du){__A, __A};
__v4si __D = (__v4si)(__v2du){__B, __B};
__C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
__v8hi __unsigned = vec_splats ((signed short) 0x00ff);
__v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
__v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
__v8hi __E = vec_unpackh ((__v16qi) __B);
__v8hi __F = vec_unpackl ((__v16qi) __B);
__C = vec_mul (__C, __E);
__D = vec_mul (__D, __F);
const __v16qu __odds =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __evens =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__E = vec_perm (__C, __D, __odds);
__F = vec_perm (__C, __D, __evens);
return (__m128i) vec_adds (__E, __F);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16(__m128i __A, __m128i __B) {
__v8hi __unsigned = vec_splats((signed short)0x00ff);
__v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
__v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
__v8hi __E = vec_unpackh((__v16qi)__B);
__v8hi __F = vec_unpackl((__v16qi)__B);
__C = vec_mul(__C, __E);
__D = vec_mul(__D, __F);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__E = vec_perm(__C, __D, __odds);
__F = vec_perm(__C, __D, __evens);
return (__m128i)vec_adds(__E, __F);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
__v8hi __C = (__v8hi) (__v2du) { __A, __A };
__C = vec_unpackl ((__v16qi) __C);
const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
__C = vec_and (__C, __unsigned);
__v8hi __D = (__v8hi) (__v2du) { __B, __B };
__D = vec_unpackl ((__v16qi) __D);
__D = vec_mul (__C, __D);
const __v16qu __odds =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __evens =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__C = vec_perm (__D, __D, __odds);
__D = vec_perm (__D, __D, __evens);
__C = vec_adds (__C, __D);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__C = vec_unpackl((__v16qi)__C);
const __v8hi __unsigned = vec_splats((signed short)0x00ff);
__C = vec_and(__C, __unsigned);
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__D = vec_unpackl((__v16qi)__D);
__D = vec_mul(__C, __D);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__C = vec_perm(__D, __D, __odds);
__D = vec_perm(__D, __D, __evens);
__C = vec_adds(__C, __D);
return (__m64)((__v2du)(__C))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
__v4si __C = vec_unpackh ((__v8hi) __A);
__v4si __D = vec_unpackh ((__v8hi) __B);
__C = vec_mul (__C, __D);
__D = vec_unpackl ((__v8hi) __A);
__v4si __E = vec_unpackl ((__v8hi) __B);
__D = vec_mul (__D, __E);
const __v4su __shift = vec_splats ((unsigned int) 14);
__C = vec_sr (__C, __shift);
__D = vec_sr (__D, __shift);
const __v4si __ones = vec_splats ((signed int) 1);
__C = vec_add (__C, __ones);
__C = vec_sr (__C, (__v4su) __ones);
__D = vec_add (__D, __ones);
__D = vec_sr (__D, (__v4su) __ones);
return (__m128i) vec_pack (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16(__m128i __A, __m128i __B) {
__v4si __C = vec_unpackh((__v8hi)__A);
__v4si __D = vec_unpackh((__v8hi)__B);
__C = vec_mul(__C, __D);
__D = vec_unpackl((__v8hi)__A);
__v4si __E = vec_unpackl((__v8hi)__B);
__D = vec_mul(__D, __E);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
__D = vec_sr(__D, __shift);
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__D = vec_add(__D, __ones);
__D = vec_sr(__D, (__v4su)__ones);
return (__m128i)vec_pack(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
__v4si __C = (__v4si) (__v2du) { __A, __A };
__C = vec_unpackh ((__v8hi) __C);
__v4si __D = (__v4si) (__v2du) { __B, __B };
__D = vec_unpackh ((__v8hi) __D);
__C = vec_mul (__C, __D);
const __v4su __shift = vec_splats ((unsigned int) 14);
__C = vec_sr (__C, __shift);
const __v4si __ones = vec_splats ((signed int) 1);
__C = vec_add (__C, __ones);
__C = vec_sr (__C, (__v4su) __ones);
__v8hi __E = vec_pack (__C, __D);
return (__m64) ((__v2du) (__E))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __A};
__C = vec_unpackh((__v8hi)__C);
__v4si __D = (__v4si)(__v2du){__B, __B};
__D = vec_unpackh((__v8hi)__D);
__C = vec_mul(__C, __D);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__v8hi __E = vec_pack(__C, __D);
return (__m64)((__v2du)(__E))[0];
}
#else
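
A usage sketch for the byte-shuffle wrapper above (same assumed build flags as the previous example): control bytes with the high bit set zero the destination lane, otherwise the low four bits select a source byte, matching the vec_cmplt/vec_sel sequence in the implementation.

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  /* Source bytes 0..15 (lane 0 holds 0, lane 15 holds 15). */
  __m128i src = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                             7, 6, 5, 4, 3, 2, 1, 0);
  /* First eight lanes copy bytes 0..7; the negative controls zero the rest. */
  __m128i ctl = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                             7, 6, 5, 4, 3, 2, 1, 0);
  __m128i r = _mm_shuffle_epi8(src, ctl);

  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 16; i++)
    printf("%u ", out[i]); /* 0 1 2 3 4 5 6 7 0 0 0 0 0 0 0 0 */
  printf("\n");
  return 0;
}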

File diff suppressed because it is too large


@@ -8,15 +8,15 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
// CHECK-BE-DAG: @_mm_shufflelo_epi16.permute_selectors = internal constant [4 x i16] [i16 1, i16 515, i16 1029, i16 1543], align 2
// CHECK-BE-DAG: @_mm_movemask_pd.__perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
// CHECK-BE-DAG: @_mm_shuffle_epi32.__permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-BE-DAG: @_mm_shufflehi_epi16.__permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
// CHECK-BE-DAG: @_mm_shufflelo_epi16.__permute_selectors = internal constant [4 x i16] [i16 1, i16 515, i16 1029, i16 1543], align 2
// CHECK-LE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144>, align 16
// CHECK-LE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
// CHECK-LE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
// CHECK-LE-DAG: @_mm_shufflelo_epi16.permute_selectors = internal constant [4 x i16] [i16 256, i16 770, i16 1284, i16 1798], align 2
// CHECK-LE-DAG: @_mm_movemask_pd.__perm_mask = internal constant <4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144>, align 16
// CHECK-LE-DAG: @_mm_shuffle_epi32.__permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
// CHECK-LE-DAG: @_mm_shufflehi_epi16.__permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
// CHECK-LE-DAG: @_mm_shufflelo_epi16.__permute_selectors = internal constant [4 x i16] [i16 256, i16 770, i16 1284, i16 1798], align 2
#include <emmintrin.h>
@@ -1008,14 +1008,14 @@ test_shuffle() {
// CHECK: %[[SHR:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
// CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR]], 3
// CHECK: sext i32 %[[AND4]] to i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
// CHECK: call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])
@@ -1046,7 +1046,7 @@ test_shuffle() {
// CHECK: sext i32 %[[AND4]] to i64
// CHECK-LE: store <2 x i64> <i64 1663540288323457296, i64 0>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK-BE: store <2 x i64> <i64 1157726452361532951, i64 0>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflehi_epi16.permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
// CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_shufflelo_epi16
@@ -1063,7 +1063,7 @@ test_shuffle() {
// CHECK: sext i32 %[[AND4]] to i64
// CHECK-LE: store <2 x i64> <i64 0, i64 2242261671028070680>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK-BE: store <2 x i64> <i64 0, i64 1736447835066146335>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflelo_epi16.permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
// CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16])
void __attribute__((noinline))


@@ -29,11 +29,11 @@ float fs[4];
int i, i2;
long long i64;
// CHECK-LE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
// CHECK-BE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 1543, i16 1029, i16 515, i16 1], align 2
// CHECK-LE-DAG: @_mm_shuffle_pi16.__permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
// CHECK-BE-DAG: @_mm_shuffle_pi16.__permute_selectors = internal constant [4 x i16] [i16 1543, i16 1029, i16 515, i16 1], align 2
// CHECK-LE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
// CHECK-BE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-LE-DAG: @_mm_shuffle_ps.__permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
// CHECK-BE-DAG: @_mm_shuffle_ps.__permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
void __attribute__((noinline))
test_add() {
@@ -887,16 +887,16 @@ test_shuffle() {
// CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
// CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3
// CHECK: sext i32 %[[AND4]] to i64
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0
// CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1
// CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2
// CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3
// CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0
// CHECK: call <2 x i64> @vec_splats(unsigned long long)
@@ -916,14 +916,14 @@ test_shuffle() {
// CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
// CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3
// CHECK: sext i32 %[[AND4]] to i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
// CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
// CHECK: %[[ADD2:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD2]], i32 3
// CHECK: call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])
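
The FileCheck updates in the two test files above follow mechanically from the rename: a function-scope static inside one of these extern-inline wrappers is emitted as an internal global named <function>.<variable>, so adding the '__' prefix to the local changes the symbol the tests match. A hedged miniature (function name and contents hypothetical):

/* Assumed build: clang -S -emit-llvm -O0 sel_demo.c
   The IR contains an internal global along the lines of
     @pick.__selectors = internal constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
   which is why @_mm_shuffle_ps.permute_selectors became
   @_mm_shuffle_ps.__permute_selectors in the CHECK lines. */
int pick(int __i) {
  static const int __selectors[4] = {0, 1, 2, 3};
  return __selectors[__i & 3];
}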