; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
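; The x86_64-linux and x86_64-win32 RUN lines share the CHECK prefix, which is why
; the pointer argument below is matched as either %rdi or %rcx; the 32-bit -O0 RUN
; line is checked separately under the CHECK_O0 prefix.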
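
; t00: the mask <0, 1, 2, 4> keeps lanes 0-2 of the vector loaded from %a0 and puts
; element 0 of the other load into lane 3; the CHECK lines below expect this blend to
; fold into a movss plus shufps sequence.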
define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
entry:
; CHECK: movaps ({{%rdi|%rcx}}), %xmm0
; CHECK: movaps %xmm0, %xmm1
; CHECK-NEXT: movss %xmm2, %xmm1
; CHECK-NEXT: shufps $36, %xmm1, %xmm0
%0 = load <4 x i32>* undef, align 16
%1 = load <4 x i32>* %a0, align 16
%2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %2
}
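
; t01: a scalar double is loaded and inserted into the high element of a
; <2 x double>; at -O0 the CHECK_O0 lines expect the value to be materialized with
; movsd and then duplicated with unpcklpd.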
define void @t01(double* %a0) nounwind ssp {
entry:
; CHECK_O0: movsd (%eax), %xmm0
; CHECK_O0: unpcklpd %xmm0, %xmm0
%tmp93 = load double* %a0, align 8
%vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
store <2 x double> %vecinit94, <2 x double>* undef
ret void
}
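
; t02: element 0 of one vector load and element 1 of another are combined into a
; <2 x i32> and stored; the CHECK lines below expect plain scalar moves (four movs
; followed by ret) with no vector shuffles.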
define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
entry:
; CHECK: t02
; CHECK: mov
; CHECK-NEXT: mov
; CHECK-NEXT: mov
; CHECK-NEXT: mov
; CHECK-NEXT: ret
%0 = bitcast <8 x i32>* %source to <4 x i32>*
%arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
%tmp2 = load <4 x i32>* %arrayidx, align 16
%tmp3 = extractelement <4 x i32> %tmp2, i32 0
%tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
%arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
%1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
%tmp8 = load <4 x i32>* %1, align 16
%tmp9 = extractelement <4 x i32> %tmp8, i32 1
%tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
ret void
}