[NFC] Format and uglify PowerPC intrinsics headers
This change formats the PowerPC intrinsics wrapper headers into LLVM style and adds an extra '__' prefix to all local variables to prevent conflicts with user code.
parent 028f9f5b2b
commit 895e5b2d80
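The '__' prefix matters because these are public headers: C reserves identifiers beginning with two underscores for the implementation, so prefixed locals cannot collide with user macros. A minimal sketch of the failure mode the rename prevents (hypothetical user code, not from this patch):

/* A perfectly legal user macro that breaks the pre-rename header: its
   local "const unsigned long mask = ..." would expand to
   "const unsigned long 0x1 = ...", which does not compile. Identifiers
   starting with "__" are reserved, so the new "__mask" is immune. */
#define mask 0x1
#include <immintrin.h> /* with the old headers this fails to compile */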
@@ -50,37 +50,37 @@ extern __inline unsigned long long
 extern __inline unsigned long long
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     _pdep_u64(unsigned long long __X, unsigned long long __M) {
-  unsigned long result = 0x0UL;
-  const unsigned long mask = 0x8000000000000000UL;
-  unsigned long m = __M;
-  unsigned long c, t;
-  unsigned long p;
+  unsigned long __result = 0x0UL;
+  const unsigned long __mask = 0x8000000000000000UL;
+  unsigned long __m = __M;
+  unsigned long __c, __t;
+  unsigned long __p;
 
   /* The pop-count of the mask gives the number of the bits from
      source to process. This is also needed to shift bits from the
      source into the correct position for the result. */
-  p = 64 - __builtin_popcountl(__M);
+  __p = 64 - __builtin_popcountl(__M);
 
   /* The loop is for the number of '1' bits in the mask and clearing
      each mask bit as it is processed. */
-  while (m != 0) {
-    c = __builtin_clzl(m);
-    t = __X << (p - c);
-    m ^= (mask >> c);
-    result |= (t & (mask >> c));
-    p++;
+  while (__m != 0) {
+    __c = __builtin_clzl(__m);
+    __t = __X << (__p - __c);
+    __m ^= (__mask >> __c);
+    __result |= (__t & (__mask >> __c));
+    __p++;
   }
-  return (result);
+  return __result;
 }
 
 extern __inline unsigned long long
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     _pext_u64(unsigned long long __X, unsigned long long __M) {
-  unsigned long p = 0x4040404040404040UL; // initial bit permute control
-  const unsigned long mask = 0x8000000000000000UL;
-  unsigned long m = __M;
-  unsigned long c;
-  unsigned long result;
+  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
+  const unsigned long __mask = 0x8000000000000000UL;
+  unsigned long __m = __M;
+  unsigned long __c;
+  unsigned long __result;
 
   /* if the mask is constant and selects 8 bits or less we can use
      the Power8 Bit permute instruction. */
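For readers unfamiliar with the emulated intrinsic, a minimal usage sketch (not part of the patch; assumes a target where _pdep_u64 is available, e.g. x86 with -mbmi2, or PowerPC with these wrapper headers and -DNO_WARN_X86_INTRINSICS):

#include <stdio.h>
#include <immintrin.h>

int main(void) {
  unsigned long long x = 0xFULL;    /* four low-order source bits */
  unsigned long long m = 0xF0F0ULL; /* deposit positions */
  /* pdep scatters the low-order bits of x into the set bits of m,
     lowest set bit first: here all four source bits land in bits 4-7. */
  printf("%#llx\n", _pdep_u64(x, m)); /* prints 0xf0 */
  return 0;
}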
@@ -88,30 +88,31 @@ extern __inline unsigned long long
     /* Also if the pext mask is constant, then the popcount is
        constant, we can evaluate the following loop at compile
        time and use a constant bit permute vector. */
-    for (long i = 0; i < __builtin_popcountl(__M); i++) {
-      c = __builtin_clzl(m);
-      p = (p << 8) | c;
-      m ^= (mask >> c);
+    long __i;
+    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
+      __c = __builtin_clzl(__m);
+      __p = (__p << 8) | __c;
+      __m ^= (__mask >> __c);
     }
-    result = __builtin_bpermd(p, __X);
+    __result = __builtin_bpermd(__p, __X);
   } else {
-    p = 64 - __builtin_popcountl(__M);
-    result = 0;
+    __p = 64 - __builtin_popcountl(__M);
+    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code. The while
       loop avoids unrolling and the compiler commons the xor
       from clearing the mask bit with the (m != 0) test. The
       result is a more compact loop setup and body. */
-    while (m != 0) {
-      unsigned long t;
-      c = __builtin_clzl(m);
-      t = (__X & (mask >> c)) >> (p - c);
-      m ^= (mask >> c);
-      result |= (t);
-      p++;
+    while (__m != 0) {
+      unsigned long __t;
+      __c = __builtin_clzl(__m);
+      __t = (__X & (__mask >> __c)) >> (__p - __c);
+      __m ^= (__mask >> __c);
+      __result |= (__t);
+      __p++;
     }
   }
-  return (result);
+  return __result;
 }
 
 /* these 32-bit implementations depend on 64-bit pdep/pext
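The companion gather operation, similarly sketched: _pext_u64 extracts the bits of the source selected by the mask and packs them into the low-order bits of the result (same availability assumptions as the pdep sketch above).

#include <immintrin.h>

unsigned long long extract_high_byte(unsigned long long v) {
  /* Gathers bits 8-15 of v into bits 0-7: 0xAB00 -> 0xAB. */
  return _pext_u64(v, 0xFF00ULL);
}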
[File diff suppressed because it is too large]
@@ -17,31 +17,25 @@
 /* We can't depend on <stdlib.h> since the prototype of posix_memalign
    may not be visible. */
 #ifndef __cplusplus
-extern int posix_memalign (void **, size_t, size_t);
+extern int posix_memalign(void **, size_t, size_t);
 #else
-extern "C" int posix_memalign (void **, size_t, size_t);
+extern "C" int posix_memalign(void **, size_t, size_t);
 #endif
 
-static __inline void *
-_mm_malloc (size_t size, size_t alignment)
-{
+static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
   /* PowerPC64 ELF V2 ABI requires quadword alignment. */
-  size_t vec_align = sizeof (__vector float);
-  void *ptr;
+  size_t __vec_align = sizeof(__vector float);
+  void *__ptr;
 
-  if (alignment < vec_align)
-    alignment = vec_align;
-  if (posix_memalign (&ptr, alignment, size) == 0)
-    return ptr;
+  if (__alignment < __vec_align)
+    __alignment = __vec_align;
+  if (posix_memalign(&__ptr, __alignment, __size) == 0)
+    return __ptr;
   else
     return NULL;
 }
 
-static __inline void
-_mm_free (void * ptr)
-{
-  free (ptr);
-}
+static __inline void _mm_free(void *__ptr) { free(__ptr); }
 
 #else
 #include_next <mm_malloc.h>
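A usage sketch (not from the patch): the wrapper rounds any requested alignment up to at least sizeof(__vector float), i.e. 16 bytes, per the quadword requirement noted in the comment above.

#include <mm_malloc.h>

void demo(void) {
  /* The requested alignment of 8 is silently raised to 16. */
  float *buf = _mm_malloc(64 * sizeof(float), 8);
  if (buf) {
    /* ... use the 16-byte-aligned buffer ... */
    _mm_free(buf);
  }
}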
[File diff suppressed because it is too large]
@@ -32,7 +32,8 @@
    In the specific case of the monitor and mwait instructions there are
    no direct equivalent in the PowerISA at this time. So those
    intrinsics are not implemented. */
-#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#error \
+    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
 #endif
 
 #ifndef PMMINTRIN_H_
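Without -DNO_WARN_X86_INTRINSICS, including this header therefore fails at preprocessing time. An illustrative opt-in compile; every flag other than the define is a placeholder:

clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 \
      -DNO_WARN_X86_INTRINSICS -c demo.c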
@@ -43,106 +44,94 @@
 /* We need definitions from the SSE2 and SSE header files*/
 #include <emmintrin.h>
 
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_addsub_ps (__m128 __X, __m128 __Y)
-{
-  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
-  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
-  return (__m128) vec_add (__X, even_neg_Y);
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_addsub_ps(__m128 __X, __m128 __Y) {
+  const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
+  __v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
+  return (__m128)vec_add(__X, __even_neg_Y);
 }
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_addsub_pd (__m128d __X, __m128d __Y)
-{
-  const __v2df even_n0 = {-0.0, 0.0};
-  __v2df even_neg_Y = vec_xor(__Y, even_n0);
-  return (__m128d) vec_add (__X, even_neg_Y);
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_addsub_pd(__m128d __X, __m128d __Y) {
+  const __v2df __even_n0 = {-0.0, 0.0};
+  __v2df __even_neg_Y = vec_xor(__Y, __even_n0);
+  return (__m128d)vec_add(__X, __even_neg_Y);
 }
 
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_ps (__m128 __X, __m128 __Y)
-{
-  __vector unsigned char xform2 = {
-    0x00, 0x01, 0x02, 0x03,
-    0x08, 0x09, 0x0A, 0x0B,
-    0x10, 0x11, 0x12, 0x13,
-    0x18, 0x19, 0x1A, 0x1B
-  };
-  __vector unsigned char xform1 = {
-    0x04, 0x05, 0x06, 0x07,
-    0x0C, 0x0D, 0x0E, 0x0F,
-    0x14, 0x15, 0x16, 0x17,
-    0x1C, 0x1D, 0x1E, 0x1F
-  };
-  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
-                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_ps(__m128 __X, __m128 __Y) {
+  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
+                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
+                                     0x18, 0x19, 0x1A, 0x1B};
+  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
+                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
+                                     0x1C, 0x1D, 0x1E, 0x1F};
+  return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
+                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
 }
 
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_ps (__m128 __X, __m128 __Y)
-{
-  __vector unsigned char xform2 = {
-    0x00, 0x01, 0x02, 0x03,
-    0x08, 0x09, 0x0A, 0x0B,
-    0x10, 0x11, 0x12, 0x13,
-    0x18, 0x19, 0x1A, 0x1B
-  };
-  __vector unsigned char xform1 = {
-    0x04, 0x05, 0x06, 0x07,
-    0x0C, 0x0D, 0x0E, 0x0F,
-    0x14, 0x15, 0x16, 0x17,
-    0x1C, 0x1D, 0x1E, 0x1F
-  };
-  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
-                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_ps(__m128 __X, __m128 __Y) {
+  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
+                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
+                                     0x18, 0x19, 0x1A, 0x1B};
+  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
+                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
+                                     0x1C, 0x1D, 0x1E, 0x1F};
+  return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
+                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
 }
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_pd (__m128d __X, __m128d __Y)
-{
-  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
-                            vec_mergel ((__v2df) __X, (__v2df)__Y));
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_pd(__m128d __X, __m128d __Y) {
+  return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
+                          vec_mergel((__v2df)__X, (__v2df)__Y));
 }
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_pd (__m128d __X, __m128d __Y)
-{
-  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
-                            vec_mergel ((__v2df) __X, (__v2df)__Y));
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_pd(__m128d __X, __m128d __Y) {
+  return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
+                          vec_mergel((__v2df)__X, (__v2df)__Y));
 }
 
 #ifdef _ARCH_PWR8
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movehdup_ps (__m128 __X)
-{
-  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_movehdup_ps(__m128 __X) {
+  return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
 }
 #endif
 
 #ifdef _ARCH_PWR8
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_moveldup_ps (__m128 __X)
-{
-  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_moveldup_ps(__m128 __X) {
+  return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
 }
 #endif
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loaddup_pd (double const *__P)
-{
-  return (__m128d) vec_splats (*__P);
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_loaddup_pd(double const *__P) {
+  return (__m128d)vec_splats(*__P);
 }
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movedup_pd (__m128d __X)
-{
-  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_movedup_pd(__m128d __X) {
+  return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
 }
 
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_lddqu_si128 (__m128i const *__P)
-{
-  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_lddqu_si128(__m128i const *__P) {
+  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
 }
 
 /* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
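The even-lane negation trick in _mm_addsub_ps above is compact: XOR with {-0.0, 0.0, -0.0, 0.0} flips only the sign bits of the even-indexed lanes, so a single vec_add realizes the alternating subtract/add pattern. A behavioral sketch with illustrative values (not from the patch):

#include <pmmintrin.h>

__m128 addsub_demo(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1,2,3,4 */
  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* lanes 0..3 = 5,6,7,8 */
  /* Even lanes subtract, odd lanes add: {1-5, 2+6, 3-7, 4+8}. */
  return _mm_addsub_ps(a, b);                    /* {-4, 8, -4, 12} */
}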
[File diff suppressed because it is too large]
@@ -33,471 +33,415 @@
 #include <pmmintrin.h>
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_epi16 (__m128i __A)
-{
-  return (__m128i) vec_abs ((__v8hi) __A);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_epi16(__m128i __A) {
+  return (__m128i)vec_abs((__v8hi)__A);
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_epi32 (__m128i __A)
-{
-  return (__m128i) vec_abs ((__v4si) __A);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_epi32(__m128i __A) {
+  return (__m128i)vec_abs((__v4si)__A);
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_epi8 (__m128i __A)
-{
-  return (__m128i) vec_abs ((__v16qi) __A);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_epi8(__m128i __A) {
+  return (__m128i)vec_abs((__v16qi)__A);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_pi16 (__m64 __A)
-{
-  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
-  return (__m64) ((__v2du) vec_abs (__B))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_pi16(__m64 __A) {
+  __v8hi __B = (__v8hi)(__v2du){__A, __A};
+  return (__m64)((__v2du)vec_abs(__B))[0];
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_pi32 (__m64 __A)
-{
-  __v4si __B = (__v4si) (__v2du) { __A, __A };
-  return (__m64) ((__v2du) vec_abs (__B))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_pi32(__m64 __A) {
+  __v4si __B = (__v4si)(__v2du){__A, __A};
+  return (__m64)((__v2du)vec_abs(__B))[0];
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_abs_pi8 (__m64 __A)
-{
-  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
-  return (__m64) ((__v2du) vec_abs (__B))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_abs_pi8(__m64 __A) {
+  __v16qi __B = (__v16qi)(__v2du){__A, __A};
+  return (__m64)((__v2du)vec_abs(__B))[0];
 }
 
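Throughout this file the __m64 variants share one pattern: duplicate the 64-bit value into both halves of a 128-bit vector, run the VMX/VSX operation, and read back one half. A usage sketch (illustrative, not from the patch):

#include <tmmintrin.h>

__m64 abs_demo(__m64 v) {
  /* Each 16-bit lane of v becomes its absolute value; internally the
     header computes vec_abs on the pair {v, v} and returns element 0. */
  return _mm_abs_pi16(v);
}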
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
-{
-  if (__builtin_constant_p (__count) && __count < 16)
-    {
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
+  if (__builtin_constant_p(__count) && __count < 16) {
 #ifdef __LITTLE_ENDIAN__
-      __A = (__m128i) vec_reve ((__v16qu) __A);
-      __B = (__m128i) vec_reve ((__v16qu) __B);
+    __A = (__m128i)vec_reve((__v16qu)__A);
+    __B = (__m128i)vec_reve((__v16qu)__B);
 #endif
-      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
+    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
 #ifdef __LITTLE_ENDIAN__
-      __A = (__m128i) vec_reve ((__v16qu) __A);
+    __A = (__m128i)vec_reve((__v16qu)__A);
 #endif
-      return __A;
-    }
+    return __A;
+  }
 
   if (__count == 0)
     return __B;
 
-  if (__count >= 16)
-    {
-      if (__count >= 32)
-        {
-          const __v16qu zero = { 0 };
-          return (__m128i) zero;
-        }
-      else
-        {
-          const __v16qu __shift =
-            vec_splats ((unsigned char) ((__count - 16) * 8));
+  if (__count >= 16) {
+    if (__count >= 32) {
+      const __v16qu __zero = {0};
+      return (__m128i)__zero;
+    } else {
+      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
 #ifdef __LITTLE_ENDIAN__
-      return (__m128i) vec_sro ((__v16qu) __A, __shift);
+      return (__m128i)vec_sro((__v16qu)__A, __shift);
 #else
-      return (__m128i) vec_slo ((__v16qu) __A, __shift);
+      return (__m128i)vec_slo((__v16qu)__A, __shift);
 #endif
     }
-    }
-  else
-    {
-      const __v16qu __shiftA =
-        vec_splats ((unsigned char) ((16 - __count) * 8));
-      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
+  } else {
+    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
+    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
 #ifdef __LITTLE_ENDIAN__
-      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
-      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
+    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
+    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
 #else
-      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
-      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
+    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
+    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
 #endif
-      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
-    }
+    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
+  }
 }
 
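A behavioral sketch for readers unfamiliar with palignr (illustrative, not from the patch): the intrinsic shifts the 32-byte concatenation A:B right by __count bytes and keeps the low 16 bytes.

#include <tmmintrin.h>

__m128i window_demo(__m128i a, __m128i b) {
  /* Returns bytes b[4..15] followed by a[0..3], a 16-byte window into
     the concatenation a:b offset 4 bytes from the start of b. */
  return _mm_alignr_epi8(a, b, 4);
}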
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
-{
-  if (__count < 16)
-    {
-      __v2du __C = { __B, __A };
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
+  if (__count < 16) {
+    __v2du __C = {__B, __A};
 #ifdef __LITTLE_ENDIAN__
-      const __v4su __shift = { __count << 3, 0, 0, 0 };
-      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
+    const __v4su __shift = {__count << 3, 0, 0, 0};
+    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
 #else
-      const __v4su __shift = { 0, 0, 0, __count << 3 };
-      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
+    const __v4su __shift = {0, 0, 0, __count << 3};
+    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
 #endif
-      return (__m64) __C[0];
-    }
-  else
-    {
-      const __m64 __zero = { 0 };
-      return __zero;
-    }
+    return (__m64)__C[0];
+  } else {
+    const __m64 __zero = {0};
+    return __zero;
+  }
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_epi16 (__m128i __A, __m128i __B)
-{
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
-  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
-  return (__m128i) vec_add (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_epi16(__m128i __A, __m128i __B) {
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
+                       16, 17, 20, 21, 24, 25, 28, 29};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
+                       18, 19, 22, 23, 26, 27, 30, 31};
+  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
+  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
+  return (__m128i)vec_add(__C, __D);
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_epi32 (__m128i __A, __m128i __B)
-{
-  const __v16qu __P =
-    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
-  const __v16qu __Q =
-    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
-  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
-  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
-  return (__m128i) vec_add (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_epi32(__m128i __A, __m128i __B) {
+  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
+                       16, 17, 18, 19, 24, 25, 26, 27};
+  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
+                       20, 21, 22, 23, 28, 29, 30, 31};
+  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
+  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
+  return (__m128i)vec_add(__C, __D);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_pi16 (__m64 __A, __m64 __B)
-{
-  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
-  __v8hi __D = vec_perm (__C, __C, __Q);
-  __C = vec_perm (__C, __C, __P);
-  __C = vec_add (__C, __D);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_pi16(__m64 __A, __m64 __B) {
+  __v8hi __C = (__v8hi)(__v2du){__A, __B};
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
+  __v8hi __D = vec_perm(__C, __C, __Q);
+  __C = vec_perm(__C, __C, __P);
+  __C = vec_add(__C, __D);
+  return (__m64)((__v2du)__C)[1];
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadd_pi32 (__m64 __A, __m64 __B)
-{
-  __v4si __C = (__v4si) (__v2du) { __A, __B };
-  const __v16qu __P =
-    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
-  const __v16qu __Q =
-    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
-  __v4si __D = vec_perm (__C, __C, __Q);
-  __C = vec_perm (__C, __C, __P);
-  __C = vec_add (__C, __D);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_pi32(__m64 __A, __m64 __B) {
+  __v4si __C = (__v4si)(__v2du){__A, __B};
+  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
+  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
+  __v4si __D = vec_perm(__C, __C, __Q);
+  __C = vec_perm(__C, __C, __P);
+  __C = vec_add(__C, __D);
+  return (__m64)((__v2du)__C)[1];
 }
 
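The horizontal adds above all follow one idea: the two vec_perm selectors gather the even-indexed and odd-indexed elements of A:B so a single vector add produces every pairwise sum. A behavioral sketch (illustrative values, not from the patch):

#include <tmmintrin.h>

__m128i hadd_demo(void) {
  __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* lanes 0..7 = 1..8 */
  __m128i b = _mm_set_epi16(16, 15, 14, 13, 12, 11, 10, 9);
  /* Result lanes: 1+2, 3+4, 5+6, 7+8, 9+10, 11+12, 13+14, 15+16. */
  return _mm_hadd_epi16(a, b);
}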
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadds_epi16 (__m128i __A, __m128i __B)
-{
-  __v4si __C = { 0 }, __D = { 0 };
-  __C = vec_sum4s ((__v8hi) __A, __C);
-  __D = vec_sum4s ((__v8hi) __B, __D);
-  __C = (__v4si) vec_packs (__C, __D);
-  return (__m128i) __C;
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadds_epi16(__m128i __A, __m128i __B) {
+  __v4si __C = {0}, __D = {0};
+  __C = vec_sum4s((__v8hi)__A, __C);
+  __D = vec_sum4s((__v8hi)__B, __D);
+  __C = (__v4si)vec_packs(__C, __D);
+  return (__m128i)__C;
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hadds_pi16 (__m64 __A, __m64 __B)
-{
-  const __v4si __zero = { 0 };
-  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
-  __v4si __D = vec_sum4s (__C, __zero);
-  __C = vec_packs (__D, __D);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadds_pi16(__m64 __A, __m64 __B) {
+  const __v4si __zero = {0};
+  __v8hi __C = (__v8hi)(__v2du){__A, __B};
+  __v4si __D = vec_sum4s(__C, __zero);
+  __C = vec_packs(__D, __D);
+  return (__m64)((__v2du)__C)[1];
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_epi16 (__m128i __A, __m128i __B)
-{
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
-  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
-  return (__m128i) vec_sub (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_epi16(__m128i __A, __m128i __B) {
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
+                       16, 17, 20, 21, 24, 25, 28, 29};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
+                       18, 19, 22, 23, 26, 27, 30, 31};
+  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
+  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
+  return (__m128i)vec_sub(__C, __D);
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_epi32 (__m128i __A, __m128i __B)
-{
-  const __v16qu __P =
-    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
-  const __v16qu __Q =
-    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
-  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
-  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
-  return (__m128i) vec_sub (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_epi32(__m128i __A, __m128i __B) {
+  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
+                       16, 17, 18, 19, 24, 25, 26, 27};
+  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
+                       20, 21, 22, 23, 28, 29, 30, 31};
+  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
+  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
+  return (__m128i)vec_sub(__C, __D);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_pi16 (__m64 __A, __m64 __B)
-{
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
-  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
-  __v8hi __D = vec_perm (__C, __C, __Q);
-  __C = vec_perm (__C, __C, __P);
-  __C = vec_sub (__C, __D);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_pi16(__m64 __A, __m64 __B) {
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
+  __v8hi __C = (__v8hi)(__v2du){__A, __B};
+  __v8hi __D = vec_perm(__C, __C, __Q);
+  __C = vec_perm(__C, __C, __P);
+  __C = vec_sub(__C, __D);
+  return (__m64)((__v2du)__C)[1];
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsub_pi32 (__m64 __A, __m64 __B)
-{
-  const __v16qu __P =
-    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
-  const __v16qu __Q =
-    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
-  __v4si __C = (__v4si) (__v2du) { __A, __B };
-  __v4si __D = vec_perm (__C, __C, __Q);
-  __C = vec_perm (__C, __C, __P);
-  __C = vec_sub (__C, __D);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_pi32(__m64 __A, __m64 __B) {
+  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
+  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
+  __v4si __C = (__v4si)(__v2du){__A, __B};
+  __v4si __D = vec_perm(__C, __C, __Q);
+  __C = vec_perm(__C, __C, __P);
+  __C = vec_sub(__C, __D);
+  return (__m64)((__v2du)__C)[1];
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsubs_epi16 (__m128i __A, __m128i __B)
-{
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
-  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
-  return (__m128i) vec_subs (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
+                       16, 17, 20, 21, 24, 25, 28, 29};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
+                       18, 19, 22, 23, 26, 27, 30, 31};
+  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
+  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
+  return (__m128i)vec_subs(__C, __D);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_hsubs_pi16 (__m64 __A, __m64 __B)
-{
-  const __v16qu __P =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
-  const __v16qu __Q =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
-  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
-  __v8hi __D = vec_perm (__C, __C, __P);
-  __v8hi __E = vec_perm (__C, __C, __Q);
-  __C = vec_subs (__D, __E);
-  return (__m64) ((__v2du) __C)[1];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
+  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
+  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
+  __v8hi __C = (__v8hi)(__v2du){__A, __B};
+  __v8hi __D = vec_perm(__C, __C, __P);
+  __v8hi __E = vec_perm(__C, __C, __Q);
+  __C = vec_subs(__D, __E);
+  return (__m64)((__v2du)__C)[1];
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_epi8 (__m128i __A, __m128i __B)
-{
-  const __v16qi __zero = { 0 };
-  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
-  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
-  return (__m128i) vec_sel (__C, __zero, __select);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
+  const __v16qi __zero = {0};
+  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
+  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
+  return (__m128i)vec_sel(__C, __zero, __select);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_pi8 (__m64 __A, __m64 __B)
-{
-  const __v16qi __zero = { 0 };
-  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
-  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
-  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
-  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
-  __C = vec_sel (__C, __zero, __select);
-  return (__m64) ((__v2du) (__C))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
+  const __v16qi __zero = {0};
+  __v16qi __C = (__v16qi)(__v2du){__A, __A};
+  __v16qi __D = (__v16qi)(__v2du){__B, __B};
+  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
+  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
+  __C = vec_sel(__C, __zero, __select);
+  return (__m64)((__v2du)(__C))[0];
}
 
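In _mm_shuffle_epi8 above, vec_perm performs the byte gather while the vec_cmplt/vec_sel pair reproduces pshufb's rule that a control byte with its high bit set zeroes that result lane. A usage sketch (illustrative, not from the patch):

#include <tmmintrin.h>

__m128i reverse_bytes(__m128i v) {
  /* Control bytes are indices into v; none has its high bit set here,
     so no lane is zeroed and the vector's bytes are simply reversed. */
  const __m128i ctrl = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(v, ctrl);
}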
 #ifdef _ARCH_PWR8
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_epi8 (__m128i __A, __m128i __B)
-{
-  const __v16qi __zero = { 0 };
-  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_epi8(__m128i __A, __m128i __B) {
+  const __v16qi __zero = {0};
+  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
   __v16qi __selectpos =
-    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
-  __v16qi __conv = vec_add (__selectneg, __selectpos);
-  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
+      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
+  __v16qi __conv = vec_add(__selectneg, __selectpos);
+  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
 }
 #endif
 
 #ifdef _ARCH_PWR8
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_epi16 (__m128i __A, __m128i __B)
-{
-  const __v8hi __zero = { 0 };
-  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
-  __v8hi __selectpos =
-    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
-  __v8hi __conv = vec_add (__selectneg, __selectpos);
-  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_epi16(__m128i __A, __m128i __B) {
+  const __v8hi __zero = {0};
+  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
+  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
+  __v8hi __conv = vec_add(__selectneg, __selectpos);
+  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
 }
 #endif
 
 #ifdef _ARCH_PWR8
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_epi32 (__m128i __A, __m128i __B)
-{
-  const __v4si __zero = { 0 };
-  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
-  __v4si __selectpos =
-    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
-  __v4si __conv = vec_add (__selectneg, __selectpos);
-  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_epi32(__m128i __A, __m128i __B) {
+  const __v4si __zero = {0};
+  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
+  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
+  __v4si __conv = vec_add(__selectneg, __selectpos);
+  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
 }
 #endif
 
 #ifdef _ARCH_PWR8
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_pi8 (__m64 __A, __m64 __B)
-{
-  const __v16qi __zero = { 0 };
-  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
-  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
-  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
-  return (__m64) ((__v2du) (__C))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_pi8(__m64 __A, __m64 __B) {
+  const __v16qi __zero = {0};
+  __v16qi __C = (__v16qi)(__v2du){__A, __A};
+  __v16qi __D = (__v16qi)(__v2du){__B, __B};
+  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
+  return (__m64)((__v2du)(__C))[0];
 }
 #endif
 
 #ifdef _ARCH_PWR8
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_pi16 (__m64 __A, __m64 __B)
-{
-  const __v8hi __zero = { 0 };
-  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
-  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
-  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
-  return (__m64) ((__v2du) (__C))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_pi16(__m64 __A, __m64 __B) {
+  const __v8hi __zero = {0};
+  __v8hi __C = (__v8hi)(__v2du){__A, __A};
+  __v8hi __D = (__v8hi)(__v2du){__B, __B};
+  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
+  return (__m64)((__v2du)(__C))[0];
 }
 #endif
 
 #ifdef _ARCH_PWR8
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sign_pi32 (__m64 __A, __m64 __B)
-{
-  const __v4si __zero = { 0 };
-  __v4si __C = (__v4si) (__v2du) { __A, __A };
-  __v4si __D = (__v4si) (__v2du) { __B, __B };
-  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
-  return (__m64) ((__v2du) (__C))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sign_pi32(__m64 __A, __m64 __B) {
+  const __v4si __zero = {0};
+  __v4si __C = (__v4si)(__v2du){__A, __A};
+  __v4si __D = (__v4si)(__v2du){__B, __B};
+  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
+  return (__m64)((__v2du)(__C))[0];
 }
 #endif
 
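The comparison pair in the sign functions builds an elementwise sign multiplier: __selectneg is -1 in lanes where __B is negative (an all-ones compare result), __selectpos is +1 where __B is positive (a negated all-ones result), so their sum is sign(__B) and one vec_mul applies it, zeroing lanes where __B is zero. Scalar analogue (illustrative):

#include <stdint.h>

int16_t sign_lane(int16_t a, int16_t b) {
  int16_t selectneg = (b < 0) ? -1 : 0; /* one lane of vec_cmplt */
  int16_t selectpos = (b > 0) ? 1 : 0;  /* one lane of -vec_cmpgt */
  return (int16_t)(a * (selectneg + selectpos)); /* a * sign(b) */
}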
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maddubs_epi16 (__m128i __A, __m128i __B)
-{
-  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
-  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
-  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
-  __v8hi __E = vec_unpackh ((__v16qi) __B);
-  __v8hi __F = vec_unpackl ((__v16qi) __B);
-  __C = vec_mul (__C, __E);
-  __D = vec_mul (__D, __F);
-  const __v16qu __odds =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-  const __v16qu __evens =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-  __E = vec_perm (__C, __D, __odds);
-  __F = vec_perm (__C, __D, __evens);
-  return (__m128i) vec_adds (__E, __F);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
+  __v8hi __unsigned = vec_splats((signed short)0x00ff);
+  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
+  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
+  __v8hi __E = vec_unpackh((__v16qi)__B);
+  __v8hi __F = vec_unpackl((__v16qi)__B);
+  __C = vec_mul(__C, __E);
+  __D = vec_mul(__D, __F);
+  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
+                          16, 17, 20, 21, 24, 25, 28, 29};
+  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
+                           18, 19, 22, 23, 26, 27, 30, 31};
+  __E = vec_perm(__C, __D, __odds);
+  __F = vec_perm(__C, __D, __evens);
+  return (__m128i)vec_adds(__E, __F);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maddubs_pi16 (__m64 __A, __m64 __B)
-{
-  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
-  __C = vec_unpackl ((__v16qi) __C);
-  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
-  __C = vec_and (__C, __unsigned);
-  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
-  __D = vec_unpackl ((__v16qi) __D);
-  __D = vec_mul (__C, __D);
-  const __v16qu __odds =
-    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-  const __v16qu __evens =
-    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-  __C = vec_perm (__D, __D, __odds);
-  __D = vec_perm (__D, __D, __evens);
-  __C = vec_adds (__C, __D);
-  return (__m64) ((__v2du) (__C))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
+  __v8hi __C = (__v8hi)(__v2du){__A, __A};
+  __C = vec_unpackl((__v16qi)__C);
+  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
+  __C = vec_and(__C, __unsigned);
+  __v8hi __D = (__v8hi)(__v2du){__B, __B};
+  __D = vec_unpackl((__v16qi)__D);
+  __D = vec_mul(__C, __D);
+  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
+                          16, 17, 20, 21, 24, 25, 28, 29};
+  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
+                           18, 19, 22, 23, 26, 27, 30, 31};
+  __C = vec_perm(__D, __D, __odds);
+  __D = vec_perm(__D, __D, __evens);
+  __C = vec_adds(__C, __D);
+  return (__m64)((__v2du)(__C))[0];
 }
 
 extern __inline __m128i
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
-{
-  __v4si __C = vec_unpackh ((__v8hi) __A);
-  __v4si __D = vec_unpackh ((__v8hi) __B);
-  __C = vec_mul (__C, __D);
-  __D = vec_unpackl ((__v8hi) __A);
-  __v4si __E = vec_unpackl ((__v8hi) __B);
-  __D = vec_mul (__D, __E);
-  const __v4su __shift = vec_splats ((unsigned int) 14);
-  __C = vec_sr (__C, __shift);
-  __D = vec_sr (__D, __shift);
-  const __v4si __ones = vec_splats ((signed int) 1);
-  __C = vec_add (__C, __ones);
-  __C = vec_sr (__C, (__v4su) __ones);
-  __D = vec_add (__D, __ones);
-  __D = vec_sr (__D, (__v4su) __ones);
-  return (__m128i) vec_pack (__C, __D);
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
+  __v4si __C = vec_unpackh((__v8hi)__A);
+  __v4si __D = vec_unpackh((__v8hi)__B);
+  __C = vec_mul(__C, __D);
+  __D = vec_unpackl((__v8hi)__A);
+  __v4si __E = vec_unpackl((__v8hi)__B);
+  __D = vec_mul(__D, __E);
+  const __v4su __shift = vec_splats((unsigned int)14);
+  __C = vec_sr(__C, __shift);
+  __D = vec_sr(__D, __shift);
+  const __v4si __ones = vec_splats((signed int)1);
+  __C = vec_add(__C, __ones);
+  __C = vec_sr(__C, (__v4su)__ones);
+  __D = vec_add(__D, __ones);
+  __D = vec_sr(__D, (__v4su)__ones);
+  return (__m128i)vec_pack(__C, __D);
 }
 
 extern __inline __m64
-__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
-{
-  __v4si __C = (__v4si) (__v2du) { __A, __A };
-  __C = vec_unpackh ((__v8hi) __C);
-  __v4si __D = (__v4si) (__v2du) { __B, __B };
-  __D = vec_unpackh ((__v8hi) __D);
-  __C = vec_mul (__C, __D);
-  const __v4su __shift = vec_splats ((unsigned int) 14);
-  __C = vec_sr (__C, __shift);
-  const __v4si __ones = vec_splats ((signed int) 1);
-  __C = vec_add (__C, __ones);
-  __C = vec_sr (__C, (__v4su) __ones);
-  __v8hi __E = vec_pack (__C, __D);
-  return (__m64) ((__v2du) (__E))[0];
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
+  __v4si __C = (__v4si)(__v2du){__A, __A};
+  __C = vec_unpackh((__v8hi)__C);
+  __v4si __D = (__v4si)(__v2du){__B, __B};
+  __D = vec_unpackh((__v8hi)__D);
+  __C = vec_mul(__C, __D);
+  const __v4su __shift = vec_splats((unsigned int)14);
+  __C = vec_sr(__C, __shift);
+  const __v4si __ones = vec_splats((signed int)1);
+  __C = vec_add(__C, __ones);
+  __C = vec_sr(__C, (__v4su)__ones);
+  __v8hi __E = vec_pack(__C, __D);
+  return (__m64)((__v2du)(__E))[0];
 }
 
 #else
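The shift sequence in _mm_mulhrs_epi16 (shift right 14, add 1, shift right 1) is a rounded Q15 multiply, equivalent to (a*b + 0x4000) >> 15. A worked scalar analogue (illustrative):

#include <stdint.h>

int16_t mulhrs_lane(void) {
  int32_t prod = 0x4000 * 0x2000;          /* 0.5 * 0.25 in Q15 -> 0x08000000 */
  /* (prod >> 14) = 0x2000; +1 = 0x2001; >> 1 = 0x1000, i.e. 0.125 in Q15. */
  return (int16_t)(((prod >> 14) + 1) >> 1);
}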
[File diff suppressed because it is too large]
@@ -8,15 +8,15 @@
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
 
-// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
-// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
-// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
-// CHECK-BE-DAG: @_mm_shufflelo_epi16.permute_selectors = internal constant [4 x i16] [i16 1, i16 515, i16 1029, i16 1543], align 2
+// CHECK-BE-DAG: @_mm_movemask_pd.__perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
+// CHECK-BE-DAG: @_mm_shuffle_epi32.__permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
+// CHECK-BE-DAG: @_mm_shufflehi_epi16.__permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
+// CHECK-BE-DAG: @_mm_shufflelo_epi16.__permute_selectors = internal constant [4 x i16] [i16 1, i16 515, i16 1029, i16 1543], align 2
 
-// CHECK-LE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144>, align 16
-// CHECK-LE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
-// CHECK-LE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
-// CHECK-LE-DAG: @_mm_shufflelo_epi16.permute_selectors = internal constant [4 x i16] [i16 256, i16 770, i16 1284, i16 1798], align 2
+// CHECK-LE-DAG: @_mm_movemask_pd.__perm_mask = internal constant <4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144>, align 16
+// CHECK-LE-DAG: @_mm_shuffle_epi32.__permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
+// CHECK-LE-DAG: @_mm_shufflehi_epi16.__permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
+// CHECK-LE-DAG: @_mm_shufflelo_epi16.__permute_selectors = internal constant [4 x i16] [i16 256, i16 770, i16 1284, i16 1798], align 2
 
 #include <emmintrin.h>
 
@@ -1008,14 +1008,14 @@ test_shuffle() {
 // CHECK: %[[SHR:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR]], 3
 // CHECK: sext i32 %[[AND4]] to i64
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK: add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
 // CHECK: call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])
 
@@ -1046,7 +1046,7 @@ test_shuffle() {
 // CHECK: sext i32 %[[AND4]] to i64
 // CHECK-LE: store <2 x i64> <i64 1663540288323457296, i64 0>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
 // CHECK-BE: store <2 x i64> <i64 1157726452361532951, i64 0>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
-// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflehi_epi16.permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
+// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
 // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16])
 
 // CHECK-LABEL: define available_externally <2 x i64> @_mm_shufflelo_epi16
@@ -1063,7 +1063,7 @@ test_shuffle() {
 // CHECK: sext i32 %[[AND4]] to i64
 // CHECK-LE: store <2 x i64> <i64 0, i64 2242261671028070680>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
 // CHECK-BE: store <2 x i64> <i64 0, i64 1736447835066146335>, <2 x i64>* %{{[0-9a-zA-Z_.]+}}, align 16
-// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflelo_epi16.permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
+// CHECK-COUNT-4: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}}
 // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16])
 
 void __attribute__((noinline))
@@ -29,11 +29,11 @@ float fs[4];
 int i, i2;
 long long i64;
 
-// CHECK-LE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
-// CHECK-BE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 1543, i16 1029, i16 515, i16 1], align 2
+// CHECK-LE-DAG: @_mm_shuffle_pi16.__permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
+// CHECK-BE-DAG: @_mm_shuffle_pi16.__permute_selectors = internal constant [4 x i16] [i16 1543, i16 1029, i16 515, i16 1], align 2
 
-// CHECK-LE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
-// CHECK-BE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
+// CHECK-LE-DAG: @_mm_shuffle_ps.__permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
+// CHECK-BE-DAG: @_mm_shuffle_ps.__permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
 
 void __attribute__((noinline))
 test_add() {
@@ -887,16 +887,16 @@ test_shuffle() {
 // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3
 // CHECK: sext i32 %[[AND4]] to i64
-// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0
 // CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3
-// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1
 // CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2
-// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2
 // CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1
-// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}}
 // CHECK-LE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3
 // CHECK-BE: getelementptr inbounds [4 x i16], [4 x i16]* %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0
 // CHECK: call <2 x i64> @vec_splats(unsigned long long)
@@ -916,14 +916,14 @@ test_shuffle() {
 // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6
 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3
 // CHECK: sext i32 %[[AND4]] to i64
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
 // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2
-// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64
+// CHECK: getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.__permute_selectors, i64 0, i64
 // CHECK: %[[ADD2:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144
 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD2]], i32 3
 // CHECK: call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])