; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
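
; Truncating <4 x i64> to <4 x i32> with plain AVX should lower to a 128-bit
; lane extract plus a single vshufps that gathers the even dwords (the low
; half of each i64) from both halves, rather than a longer shuffle sequence.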
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp {
; CHECK-LABEL: trunc_64_32:
; CHECK:       # BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %B = trunc <4 x i64> %A to <4 x i32>
  ret <4 x i32> %B
}
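
; Truncating <8 x i32> to <8 x i16>: each 128-bit half is narrowed to the low
; word of each dword with a shared vpshufb mask, then the two halves are
; merged with vpunpcklqdq.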
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp {
; CHECK-LABEL: trunc_32_16:
; CHECK:       # BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %B = trunc <8 x i32> %A to <8 x i16>
  ret <8 x i16> %B
}
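
; Truncating <16 x i16> to <16 x i8>: the same pshufb-and-merge pattern, with
; a byte mask that keeps the even bytes (the low byte of each i16).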
define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp {
; CHECK-LABEL: trunc_16_8:
; CHECK:       # BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %B = trunc <16 x i16> %A to <16 x i8>
  ret <16 x i8> %B
}