2016-10-11 21:51:44 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2012-12-25 21:23:23 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
|
2010-01-24 08:24:43 +08:00
|
|
|
|
2011-10-17 15:07:51 +08:00
|
|
|
; @t0: for each i64 lane, test %src1 != 0 and %src2 != 0, AND the two
; <2 x i1> results, sign-extend the mask to <2 x i64> and store it to %dst.
; The assertion lines inside the body are autogenerated (see the note at the
; top of the file); regenerate them rather than editing them by hand.
; NOTE(review): the function is declared 'readonly' but stores through %dst -
; confirm that attribute is intentional for this test.
define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly {
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-LABEL: t0:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: # %bb.0:
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: pxor %xmm2, %xmm2
|
|
|
|
; CHECK-NEXT: pcmpeqq %xmm2, %xmm0
|
|
|
|
; CHECK-NEXT: pcmpeqq %xmm2, %xmm1
|
2017-01-29 07:52:09 +08:00
|
|
|
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; CHECK-NEXT: pxor %xmm1, %xmm2
|
|
|
|
; CHECK-NEXT: pandn %xmm2, %xmm0
|
|
|
|
; CHECK-NEXT: movdqa %xmm0, (%rdi)
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: retq
|
2010-01-24 08:24:43 +08:00
|
|
|
; Lane-wise "not equal to zero" test of each source vector.
%cmp1 = icmp ne <2 x i64> %src1, zeroinitializer
|
|
|
|
%cmp2 = icmp ne <2 x i64> %src2, zeroinitializer
|
|
|
|
; A lane of the mask is true only when both sources are non-zero in that lane.
%t1 = and <2 x i1> %cmp1, %cmp2
|
|
|
|
; Widen each i1 lane to i64: true becomes all-ones, false becomes zero.
%t2 = sext <2 x i1> %t1 to <2 x i64>
|
|
|
|
store <2 x i64> %t2, <2 x i64>* %dst
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; @t2: same computation as a lane-wise (%src1 != 0) & (%src2 != 0) mask, but on
; <3 x i64> vectors: the <3 x i1> mask is sign-extended to <3 x i64> and stored
; to %dst.
; The assertion lines inside the body are autogenerated (see the note at the
; top of the file); regenerate them rather than editing them by hand.
; NOTE(review): the function is declared 'readonly' but stores through %dst -
; confirm that attribute is intentional for this test.
define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-LABEL: t2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: # %bb.0:
|
2017-04-26 15:08:44 +08:00
|
|
|
; CHECK-NEXT: movq %r9, %xmm1
|
|
|
|
; CHECK-NEXT: movq %r8, %xmm0
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-09-19 00:45:05 +08:00
|
|
|
; CHECK-NEXT: movq %rdx, %xmm1
|
|
|
|
; CHECK-NEXT: movq %rsi, %xmm2
|
|
|
|
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
|
|
|
; CHECK-NEXT: movq %rcx, %xmm1
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
|
|
|
|
; CHECK-NEXT: pxor %xmm4, %xmm4
|
[x86] use a single shufps when it can save instructions
This is a tiny patch with a big pile of test changes.
This partially fixes PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
My motivating case looks like this:
- vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
- vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+ vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
And this happens several times in the diffs. For chips with domain-crossing penalties,
the instruction count and size reduction should usually overcome any potential
domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such
as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so
using shufps is a pure win.
So the test case diffs all appear to be improvements except one test in
vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate
zero elements and one test in combine-sra.ll where multiple uses prevent the expected
shuffle combining.
Differential Revision: https://reviews.llvm.org/D27692
llvm-svn: 289837
2016-12-16 02:03:38 +08:00
|
|
|
; CHECK-NEXT: pcmpeqq %xmm4, %xmm1
|
2017-09-19 00:45:05 +08:00
|
|
|
; CHECK-NEXT: pcmpeqd %xmm5, %xmm5
|
[x86] use a single shufps when it can save instructions
This is a tiny patch with a big pile of test changes.
This partially fixes PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
My motivating case looks like this:
- vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
- vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+ vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
And this happens several times in the diffs. For chips with domain-crossing penalties,
the instruction count and size reduction should usually overcome any potential
domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such
as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so
using shufps is a pure win.
So the test case diffs all appear to be improvements except one test in
vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate
zero elements and one test in combine-sra.ll where multiple uses prevent the expected
shuffle combining.
Differential Revision: https://reviews.llvm.org/D27692
llvm-svn: 289837
2016-12-16 02:03:38 +08:00
|
|
|
; CHECK-NEXT: pxor %xmm5, %xmm1
|
2017-09-19 00:45:05 +08:00
|
|
|
; CHECK-NEXT: pcmpeqq %xmm4, %xmm2
|
|
|
|
; CHECK-NEXT: pxor %xmm5, %xmm2
|
|
|
|
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: pcmpeqq %xmm4, %xmm3
|
|
|
|
; CHECK-NEXT: pxor %xmm5, %xmm3
|
|
|
|
; CHECK-NEXT: pcmpeqq %xmm4, %xmm0
|
|
|
|
; CHECK-NEXT: pxor %xmm5, %xmm0
|
[x86] use a single shufps when it can save instructions
This is a tiny patch with a big pile of test changes.
This partially fixes PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
My motivating case looks like this:
- vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
- vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+ vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
And this happens several times in the diffs. For chips with domain-crossing penalties,
the instruction count and size reduction should usually overcome any potential
domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such
as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so
using shufps is a pure win.
So the test case diffs all appear to be improvements except one test in
vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate
zero elements and one test in combine-sra.ll where multiple uses prevent the expected
shuffle combining.
Differential Revision: https://reviews.llvm.org/D27692
llvm-svn: 289837
2016-12-16 02:03:38 +08:00
|
|
|
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
|
2017-09-19 00:45:05 +08:00
|
|
|
; CHECK-NEXT: andps %xmm2, %xmm0
|
2016-10-11 21:51:44 +08:00
|
|
|
; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
|
|
|
|
; CHECK-NEXT: psllq $63, %xmm1
|
|
|
|
; CHECK-NEXT: psrad $31, %xmm1
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
|
|
|
|
; CHECK-NEXT: psllq $63, %xmm0
|
|
|
|
; CHECK-NEXT: psrad $31, %xmm0
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
|
|
; CHECK-NEXT: movq %xmm0, 16(%rdi)
|
|
|
|
; CHECK-NEXT: movdqa %xmm1, (%rdi)
|
|
|
|
; CHECK-NEXT: retq
|
2010-01-24 08:24:43 +08:00
|
|
|
; Lane-wise "not equal to zero" test of each source vector.
%cmp1 = icmp ne <3 x i64> %src1, zeroinitializer
|
|
|
|
%cmp2 = icmp ne <3 x i64> %src2, zeroinitializer
|
|
|
|
; A lane of the mask is true only when both sources are non-zero in that lane.
%t1 = and <3 x i1> %cmp1, %cmp2
|
|
|
|
; Widen each i1 lane to i64: true becomes all-ones, false becomes zero.
%t2 = sext <3 x i1> %t1 to <3 x i64>
|
|
|
|
store <3 x i64> %t2, <3 x i64>* %dst
|
|
|
|
ret void
|
|
|
|
}
|