forked from OSchip/llvm-project
Tweak *mmintrin.h so that they don't make any bad assumptions about alignment (which probably has little effect in practice, but better to get it right). Make the load in _mm_loadh_pi and _mm_loadl_pi a single LLVM IR instruction to make optimizing easier for CodeGen.
rdar://10054986 llvm-svn: 139874
parent 639222d090
commit 9bb51adcce
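For readers skimming the diff below, a minimal standalone sketch of the idiom the commit adopts may help. The function and struct names here are illustrative only and do not appear in the headers: wrapping the loaded element in a local struct marked __packed__ tells the compiler not to assume any particular alignment, and __may_alias__ exempts the access from strict-aliasing assumptions.

#include <stdio.h>
#include <string.h>

static double load_double_unaligned(const void *p) {
  /* Locally declared helper struct, as the headers do: __packed__ drops the
     usual 8-byte alignment assumption for the member, and __may_alias__
     keeps the access legal even if *p is not really a double object. */
  struct unaligned_double {
    double u;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct unaligned_double *)p)->u;
}

int main(void) {
  unsigned char buf[sizeof(double) + 1];
  double v = 3.25;
  memcpy(buf + 1, &v, sizeof v);                   /* deliberately misaligned */
  printf("%f\n", load_double_unaligned(buf + 1));  /* prints 3.250000 */
  return 0;
}

With clang, the member access compiles to a load with alignment 1, which is what the rewritten header bodies and the "align 1" CHECK lines in the added test expect.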
@@ -458,7 +458,11 @@ _mm_load_pd(double const *dp)
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_load1_pd(double const *dp)
 {
-  return (__m128d){ dp[0], dp[0] };
+  struct __mm_load1_pd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  double u = ((struct __mm_load1_pd_struct*)dp)->u;
+  return (__m128d){ u, u };
 }
 
 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
@@ -466,7 +470,8 @@ _mm_load1_pd(double const *dp)
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_loadr_pd(double const *dp)
 {
-  return (__m128d){ dp[1], dp[0] };
+  __m128d u = *(__m128d*)dp;
+  return __builtin_shufflevector(u, u, 1, 0);
 }
 
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
@@ -481,19 +486,31 @@ _mm_loadu_pd(double const *dp)
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_load_sd(double const *dp)
 {
-  return (__m128d){ *dp, 0.0 };
+  struct __mm_load_sd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  double u = ((struct __mm_load_sd_struct*)dp)->u;
+  return (__m128d){ u, 0 };
 }
 
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_loadh_pd(__m128d a, double const *dp)
 {
-  return (__m128d){ a[0], *dp };
+  struct __mm_loadh_pd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
+  return (__m128d){ a[0], u };
 }
 
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_loadl_pd(__m128d a, double const *dp)
 {
-  return (__m128d){ *dp, a[1] };
+  struct __mm_loadl_pd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
+  return (__m128d){ u, a[1] };
 }
 
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
@@ -535,14 +552,20 @@ _mm_move_sd(__m128d a, __m128d b)
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_store_sd(double *dp, __m128d a)
 {
-  dp[0] = a[0];
+  struct __mm_store_sd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)dp)->u = a[0];
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_store1_pd(double *dp, __m128d a)
 {
-  dp[0] = a[0];
-  dp[1] = a[0];
+  struct __mm_store1_pd_struct {
+    double u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
+  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
@@ -560,20 +583,26 @@ _mm_storeu_pd(double *dp, __m128d a)
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_storer_pd(double *dp, __m128d a)
 {
-  dp[0] = a[1];
-  dp[1] = a[0];
+  a = __builtin_shufflevector(a, a, 1, 0);
+  *(__m128d *)dp = a;
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_storeh_pd(double *dp, __m128d a)
 {
-  dp[0] = a[1];
+  struct __mm_storeh_pd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_storel_pd(double *dp, __m128d a)
 {
-  dp[0] = a[0];
+  struct __mm_storeh_pd_struct {
+    double u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
 }
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
@@ -1029,7 +1058,10 @@ _mm_loadu_si128(__m128i const *p)
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_loadl_epi64(__m128i const *p)
 {
-  return (__m128i) { *(long long*)p, 0};
+  struct __mm_loadl_epi64_struct {
+    long long u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
 }
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
@@ -84,11 +84,7 @@ _mm_hsub_pd(__m128d a, __m128d b)
   return __builtin_ia32_hsubpd(a, b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
-_mm_loaddup_pd(double const *dp)
-{
-  return (__m128d){ *dp, *dp };
-}
+#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
 
 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_movedup_pd(__m128d a)
@@ -501,31 +501,45 @@ _mm_cvtss_f32(__m128 a)
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_loadh_pi(__m128 a, const __m64 *p)
 {
-  __m128 b;
-  b[0] = *(float*)p;
-  b[1] = *((float*)p+1);
-  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
+  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_loadl_pi(__m128 a, const __m64 *p)
 {
-  __m128 b;
-  b[0] = *(float*)p;
-  b[1] = *((float*)p+1);
-  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
+  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+  return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_load_ss(const float *p)
 {
-  return (__m128){ *p, 0, 0, 0 };
+  struct __mm_load_ss_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  float u = ((struct __mm_load_ss_struct*)p)->u;
+  return (__m128){ u, 0, 0, 0 };
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_load1_ps(const float *p)
 {
-  return (__m128){ *p, *p, *p, *p };
+  struct __mm_load1_ps_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  float u = ((struct __mm_load1_ps_struct*)p)->u;
+  return (__m128){ u, u, u, u };
 }
 
 #define _mm_load_ps1(p) _mm_load1_ps(p)
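A similar standalone sketch of the _mm_loadh_pi shape from the hunk above (illustrative names again, assuming clang's vector extensions and __builtin_shufflevector, as used throughout xmmintrin.h): the pair of floats is fetched with one unaligned 8-byte vector load and then spliced into the 128-bit value with two shuffles, so CodeGen sees a single IR load instead of two scalar loads.

#include <stdio.h>

typedef float v4f32 __attribute__((__vector_size__(16)));
typedef float v2f32 __attribute__((__vector_size__(8)));

static v4f32 load_high_pair(v4f32 a, const void *p) {
  struct unaligned_v2f32 {
    v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  v2f32 b = ((const struct unaligned_v2f32 *)p)->u;       /* one 8-byte load, align 1 */
  v4f32 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);   /* widen the pair to 4 lanes */
  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);      /* keep a's low half, replace the high half */
}

int main(void) {
  v4f32 a = { 1.0f, 2.0f, 3.0f, 4.0f };
  float pair[2] = { 9.0f, 10.0f };
  v4f32 r = load_high_pair(a, pair);
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);        /* 1 2 9 10 */
  return 0;
}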
@@ -541,7 +555,7 @@ _mm_loadu_ps(const float *p)
 {
   struct __loadu_ps {
     __m128 v;
-  } __attribute__((packed, may_alias));
+  } __attribute__((__packed__, __may_alias__));
   return ((struct __loadu_ps*)p)->v;
 }
 
@@ -604,7 +618,10 @@ _mm_storel_pi(__m64 *p, __m128 a)
 static __inline__ void __attribute__((__always_inline__))
 _mm_store_ss(float *p, __m128 a)
 {
-  *p = a[0];
+  struct __mm_store_ss_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)p)->u = a[0];
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple i386-apple-darwin9 -target-cpu pentium4 -target-feature +sse4.1 -g -emit-llvm %s -o - | FileCheck %s
+
+#include <emmintrin.h>
+
+__m128 test_loadl_pi(__m128 x, void* y) {
+  // CHECK: define {{.*}} @test_loadl_pi
+  // CHECK: load <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm_loadl_pi(x,y);
+}
+
+__m128 test_loadh_pi(__m128 x, void* y) {
+  // CHECK: define {{.*}} @test_loadh_pi
+  // CHECK: load <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm_loadh_pi(x,y);
+}
+
+__m128 test_load_ss(void* y) {
+  // CHECK: define {{.*}} @test_load_ss
+  // CHECK: load float* {{.*}}, align 1{{$}}
+  return _mm_load_ss(y);
+}
+
+__m128 test_load1_ps(void* y) {
+  // CHECK: define {{.*}} @test_load1_ps
+  // CHECK: load float* {{.*}}, align 1{{$}}
+  return _mm_load1_ps(y);
+}
+
+void test_store_ss(__m128 x, void* y) {
+  // CHECK: define void @test_store_ss
+  // CHECK: store {{.*}} float* {{.*}}, align 1,
+  _mm_store_ss(y, x);
+}
+
+__m128d test_load1_pd(__m128 x, void* y) {
+  // CHECK: define {{.*}} @test_load1_pd
+  // CHECK: load double* {{.*}}, align 1{{$}}
+  return _mm_load1_pd(y);
+}
+
+__m128d test_loadr_pd(__m128 x, void* y) {
+  // CHECK: define {{.*}} @test_loadr_pd
+  // CHECK: load <2 x double>* {{.*}}, align 16{{$}}
+  return _mm_loadr_pd(y);
+}
+
+__m128d test_load_sd(void* y) {
+  // CHECK: define {{.*}} @test_load_sd
+  // CHECK: load double* {{.*}}, align 1{{$}}
+  return _mm_load_sd(y);
+}
+
+__m128d test_loadh_pd(__m128d x, void* y) {
+  // CHECK: define {{.*}} @test_loadh_pd
+  // CHECK: load double* {{.*}}, align 1{{$}}
+  return _mm_loadh_pd(x, y);
+}
+
+__m128d test_loadl_pd(__m128d x, void* y) {
+  // CHECK: define {{.*}} @test_loadl_pd
+  // CHECK: load double* {{.*}}, align 1{{$}}
+  return _mm_loadl_pd(x, y);
+}
+
+void test_store_sd(__m128d x, void* y) {
+  // CHECK: define void @test_store_sd
+  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
+  _mm_store_sd(y, x);
+}
+
+void test_store1_pd(__m128d x, void* y) {
+  // CHECK: define void @test_store1_pd
+  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
+  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
+  _mm_store1_pd(y, x);
+}
+
+void test_storer_pd(__m128d x, void* y) {
+  // CHECK: define void @test_storer_pd
+  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
+  _mm_storer_pd(y, x);
+}
+
+void test_storeh_pd(__m128d x, void* y) {
+  // CHECK: define void @test_storeh_pd
+  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
+  _mm_storeh_pd(y, x);
+}
+
+void test_storel_pd(__m128d x, void* y) {
+  // CHECK: define void @test_storel_pd
+  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
+  _mm_storel_pd(y, x);
+}
+
+__m128i test_loadl_epi64(void* y) {
+  // CHECK: define {{.*}} @test_loadl_epi64
+  // CHECK: load i64* {{.*}}, align 1{{$}}
+  return _mm_loadl_epi64(y);
+}