2013-01-26 19:14:32 +08:00
|
|
|
; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
|
|
|
|
; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
|
2014-05-29 09:42:45 +08:00
|
|
|
; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
|
2006-03-23 05:39:25 +08:00
|
|
|
|
2008-04-05 08:30:36 +08:00
|
|
|
; Splat the scalar %X across all four lanes of a <4 x float>, multiply the
; vector loaded from %Q by that splat, and store the product to %P.
define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
  ; Build the splat one lane at a time, starting from an all-zero vector.
  %lane0 = insertelement <4 x float> zeroinitializer, float %X, i32 0
  %lane1 = insertelement <4 x float> %lane0, float %X, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %X, i32 2
  %splat = insertelement <4 x float> %lane2, float %X, i32 3

  %vec = load <4 x float>* %Q
  %prod = fmul <4 x float> %vec, %splat
  store <4 x float> %prod, <4 x float>* %P
  ret void

; Both SSE configurations are expected to form the splat with a pshufd whose
; immediate is 0.
; SSE2-LABEL: test_v4sf:
; SSE2: pshufd $0

; SSE3-LABEL: test_v4sf:
; SSE3: pshufd $0
}
|
|
|
|
|
2008-04-05 08:30:36 +08:00
|
|
|
; Splat the scalar %X into both lanes of a <2 x double>, multiply the vector
; loaded from %Q by that splat, and store the product to %P.
define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
  ; Build the splat one lane at a time, starting from an all-zero vector.
  %lane0 = insertelement <2 x double> zeroinitializer, double %X, i32 0
  %splat = insertelement <2 x double> %lane0, double %X, i32 1

  %vec = load <2 x double>* %Q
  %prod = fmul <2 x double> %vec, %splat
  store <2 x double> %prod, <2 x double>* %P
  ret void

; The expected splat instruction differs by feature level: shufpd with
; immediate 0 when only SSE2 is enabled, movddup once SSE3 is enabled.
; SSE2-LABEL: test_v2sd:
; SSE2: shufpd $0

; SSE3-LABEL: test_v2sd:
; SSE3: movddup
}
|
[IR] Make {extract,insert}element accept an index of any integer type.
Given the following C code llvm currently generates suboptimal code for
x86-64:
__m128 bss4( const __m128 *ptr, size_t i, size_t j )
{
float f = ptr[i][j];
return (__m128) { f, f, f, f };
}
=================================================
define <4 x float> @_Z4bss4PKDv4_fmm(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) #0 {
%a1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
%a2 = load <4 x float>* %a1, align 16, !tbaa !1
%a3 = trunc i64 %j to i32
%a4 = extractelement <4 x float> %a2, i32 %a3
%a5 = insertelement <4 x float> undef, float %a4, i32 0
%a6 = insertelement <4 x float> %a5, float %a4, i32 1
%a7 = insertelement <4 x float> %a6, float %a4, i32 2
%a8 = insertelement <4 x float> %a7, float %a4, i32 3
ret <4 x float> %a8
}
=================================================
shlq $4, %rsi
addq %rdi, %rsi
movslq %edx, %rax
vbroadcastss (%rsi,%rax,4), %xmm0
retq
=================================================
The movslq is unneeded, but is present because of the trunc to i32 and then
sext back to i64 that the backend adds for vbroadcastss.
We can't remove it because it changes the meaning. The IR that clang
generates is already suboptimal. What clang really should emit is:
%a4 = extractelement <4 x float> %a2, i64 %j
This patch makes that legal. A separate patch will teach clang to do it.
Differential Revision: http://reviews.llvm.org/D3519
llvm-svn: 207801
2014-05-02 06:12:39 +08:00
|
|
|
|
|
|
|
; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
;
; Loads the %i-th <4 x float> from %ptr, extracts the lane selected by %j
; (truncated to i32 first), and returns that element splatted across all four
; lanes.
define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
  %2 = load <4 x float>* %1, align 16
  ; The lane index is narrowed to i32 before the extract in this variant.
  %3 = trunc i64 %j to i32
  %4 = extractelement <4 x float> %2, i32 %3
  ; Splat the extracted scalar into every lane.
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = insertelement <4 x float> %5, float %4, i32 1
  %7 = insertelement <4 x float> %6, float %4, i32 2
  %8 = insertelement <4 x float> %7, float %4, i32 3
  ret <4 x float> %8

; The label pattern ends in ':' so that it anchors on this function's own
; label line rather than on any earlier line that merely contains the name
; (the name is also a prefix of the next test's name).
; AVX-LABEL: load_extract_splat:
; No reference to rsp may appear before the broadcast, i.e. no stack spill.
; AVX-NOT: rsp
; AVX: vbroadcastss
}
|
|
|
|
|
|
|
|
; Fold extract of a load into the load's address computation, using an i64
; element index directly.
;
; Same computation as the i32-index variant of this test, except that %j is
; passed to extractelement as an i64 with no intervening trunc. With the trunc
; present, the backend has to sign-extend the index back to i64 (a movslq)
; before it can be used in the broadcast's address; with the i64 index that
; extension should not be emitted.
define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
  %2 = load <4 x float>* %1, align 16
  ; i64 index used as-is: no trunc, hence nothing to extend again later.
  %3 = extractelement <4 x float> %2, i64 %j
  ; Splat the extracted scalar into every lane.
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = insertelement <4 x float> %4, float %3, i32 1
  %6 = insertelement <4 x float> %5, float %3, i32 2
  %7 = insertelement <4 x float> %6, float %3, i32 3
  ret <4 x float> %7

; The label pattern ends in ':' so that it anchors on this function's own
; label line, consistent with the other labels in this file.
; AVX-LABEL: load_extract_splat1:
; No sign-extending move may appear before the broadcast.
; AVX-NOT: movs
; AVX: vbroadcastss
}
|